gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "params.h"
  53 #include "tree-cfg.h"
  54 #include "tree-hash-traits.h"
  55 #include "vec-perm-indices.h"
  56 #include "internal-fn.h"
  57
  58 /* Return true if load- or store-lanes optab OPTAB is implemented for
  59    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  60
  61 static bool
  62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  63                               tree vectype, unsigned HOST_WIDE_INT count)
  64 {
  65   machine_mode mode, array_mode;
  66   bool limit_p;
  67
  68   mode = TYPE_MODE (vectype);
  69   if (!targetm.array_mode (mode, count).exists (&array_mode))
  70     {
  71       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
  72       limit_p = !targetm.array_mode_supported_p (mode, count);
  73       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
  74         {
  75           if (dump_enabled_p ())
  76             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  77                              "no array mode for %s["
  78                              HOST_WIDE_INT_PRINT_DEC "]\n",
  79                              GET_MODE_NAME (mode), count);
  80           return false;
  81         }
  82     }
  83
  84   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  85     {
  86       if (dump_enabled_p ())
  87         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  88                          "cannot use %s<%s><%s>\n", name,
  89                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  90       return false;
  91     }
  92
  93   if (dump_enabled_p ())
  94     dump_printf_loc (MSG_NOTE, vect_location,
  95                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  96                      GET_MODE_NAME (mode));
  97
  98   return true;
  99 }
 100
 101
 102 /* Return the smallest scalar part of STMT.
 103    This is used to determine the vectype of the stmt.  We generally set the
 104    vectype according to the type of the result (lhs).  For stmts whose
 105    result-type is different than the type of the arguments (e.g., demotion,
 106    promotion), vectype will be reset appropriately (later).  Note that we have
 107    to visit the smallest datatype in this function, because that determines the
 108    VF.  If the smallest datatype in the loop is present only as the rhs of a
 109    promotion operation - we'd miss it.
 110    Such a case, where a variable of this datatype does not appear in the lhs
 111    anywhere in the loop, can only occur if it's an invariant: e.g.:
 112    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 113    invariant motion.  However, we cannot rely on invariant motion to always
 114    take invariants out of the loop, and so in the case of promotion we also
 115    have to check the rhs.
 116    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 117    types.  */
 118
 119 tree
 120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
 121                                HOST_WIDE_INT *rhs_size_unit)
 122 {
 123   tree scalar_type = gimple_expr_type (stmt);
 124   HOST_WIDE_INT lhs, rhs;
 125
 126   /* During the analysis phase, this function is called on arbitrary
 127      statements that might not have scalar results.  */
 128   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
 129     return scalar_type;
 130
 131   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 132
 133   if (is_gimple_assign (stmt)
 134       && (gimple_assign_cast_p (stmt)
 135           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
 136           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
 137           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
 138     {
 139       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 140
 141       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 142       if (rhs < lhs)
 143         scalar_type = rhs_type;
 144     }
 145
 146   *lhs_size_unit = lhs;
 147   *rhs_size_unit = rhs;
 148   return scalar_type;
 149 }
 150
 151
 152 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 153    tested at run-time.  Return TRUE if DDR was successfully inserted.
 154    Return false if versioning is not supported.  */
 155
 156 static bool
 157 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 158 {
 159   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 160
 161   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 162     return false;
 163
 164   if (!runtime_alias_check_p (ddr, loop,
 165                               optimize_loop_nest_for_speed_p (loop)))
 166     return false;
 167
 168   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 169   return true;
 170 }
 171
 172 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
 173
 174 static void
 175 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
 176 {
 177   vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
 178   for (unsigned int i = 0; i < checks.length(); ++i)
 179     if (checks[i] == value)
 180       return;
 181
 182   if (dump_enabled_p ())
 183     {
 184       dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
 185       dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
 186       dump_printf (MSG_NOTE, " is nonzero\n");
 187     }
 188   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
 189 }
 190
 191 /* Return true if we know that the order of vectorized STMT_A and
 192    vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
 193    At least one of the statements is a write.  */
 194
 195 static bool
 196 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
 197 {
 198   stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
 199   stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
 200
 201   /* Single statements are always kept in their original order.  */
 202   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 203       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 204     return true;
 205
 206   /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
 207      group are emitted at the position of the first scalar load and all
 208      stores in a group are emitted at the position of the last scalar store.
 209      Thus writes will happen no earlier than their current position
 210      (but could happen later) while reads will happen no later than their
 211      current position (but could happen earlier).  Reordering is therefore
 212      only possible if the first access is a write.  */
 213   gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
 214   return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
 215 }
 216
 217 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 218    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 219    distances.  These distances are conservatively correct but they don't
 220    reflect a guaranteed dependence.
 221
 222    Return true if this function does all the work necessary to avoid
 223    an alias or false if the caller should use the dependence distances
 224    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 225    the depth of the loop described by LOOP_VINFO and the other arguments
 226    are as for vect_analyze_data_ref_dependence.  */
 227
 228 static bool
 229 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 230                                        loop_vec_info loop_vinfo,
 231                                        int loop_depth, unsigned int *max_vf)
 232 {
 233   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 234   lambda_vector dist_v;
 235   unsigned int i;
 236   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 237     {
 238       int dist = dist_v[loop_depth];
 239       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 240         {
 241           /* If the user asserted safelen >= DIST consecutive iterations
 242              can be executed concurrently, assume independence.
 243
 244              ??? An alternative would be to add the alias check even
 245              in this case, and vectorize the fallback loop with the
 246              maximum VF set to safelen.  However, if the user has
 247              explicitly given a length, it's less likely that that
 248              would be a win.  */
 249           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 250             {
 251               if ((unsigned int) loop->safelen < *max_vf)
 252                 *max_vf = loop->safelen;
 253               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 254               continue;
 255             }
 256
 257           /* For dependence distances of 2 or more, we have the option
 258              of limiting VF or checking for an alias at runtime.
 259              Prefer to check at runtime if we can, to avoid limiting
 260              the VF unnecessarily when the bases are in fact independent.
 261
 262              Note that the alias checks will be removed if the VF ends up
 263              being small enough.  */
 264           return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 265         }
 266     }
 267   return true;
 268 }
 269
 270
 271 /* Function vect_analyze_data_ref_dependence.
 272
 273    Return TRUE if there (might) exist a dependence between a memory-reference
 274    DRA and a memory-reference DRB.  When versioning for alias may check a
 275    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 276    the data dependence.  */
 277
 278 static bool
 279 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 280                                   loop_vec_info loop_vinfo,
 281                                   unsigned int *max_vf)
 282 {
 283   unsigned int i;
 284   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 285   struct data_reference *dra = DDR_A (ddr);
 286   struct data_reference *drb = DDR_B (ddr);
 287   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
 288   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
 289   lambda_vector dist_v;
 290   unsigned int loop_depth;
 291
 292   /* In loop analysis all data references should be vectorizable.  */
 293   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 294       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 295     gcc_unreachable ();
 296
 297   /* Independent data accesses.  */
 298   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 299     return false;
 300
 301   if (dra == drb
 302       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 303     return false;
 304
 305   /* We do not have to consider dependences between accesses that belong
 306      to the same group.  */
 307   if (GROUP_FIRST_ELEMENT (stmtinfo_a)
 308       && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
 309     return false;
 310
 311   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 312      least two scalar iterations, there is always also a true dependence.
 313      As the vectorizer does not re-order loads and stores we can ignore
 314      the anti-dependence if TBAA can disambiguate both DRs similar to the
 315      case with known negative distance anti-dependences (positive
 316      distance anti-dependences would violate TBAA constraints).  */
 317   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 318        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 319       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 320                                  get_alias_set (DR_REF (drb))))
 321     return false;
 322
 323   /* Unknown data dependence.  */
 324   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 325     {
 326       /* If user asserted safelen consecutive iterations can be
 327          executed concurrently, assume independence.  */
 328       if (loop->safelen >= 2)
 329         {
 330           if ((unsigned int) loop->safelen < *max_vf)
 331             *max_vf = loop->safelen;
 332           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 333           return false;
 334         }
 335
 336       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 337           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 338         {
 339           if (dump_enabled_p ())
 340             {
 341               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 342                                "versioning for alias not supported for: "
 343                                "can't determine dependence between ");
 344               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 345                                  DR_REF (dra));
 346               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 347               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 348                                  DR_REF (drb));
 349               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 350             }
 351           return true;
 352         }
 353
 354       if (dump_enabled_p ())
 355         {
 356           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 357                            "versioning for alias required: "
 358                            "can't determine dependence between ");
 359           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 360                              DR_REF (dra));
 361           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 362           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 363                              DR_REF (drb));
 364           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 365         }
 366
 367       /* Add to list of ddrs that need to be tested at run-time.  */
 368       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 369     }
 370
 371   /* Known data dependence.  */
 372   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 373     {
 374       /* If user asserted safelen consecutive iterations can be
 375          executed concurrently, assume independence.  */
 376       if (loop->safelen >= 2)
 377         {
 378           if ((unsigned int) loop->safelen < *max_vf)
 379             *max_vf = loop->safelen;
 380           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 381           return false;
 382         }
 383
 384       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 385           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 386         {
 387           if (dump_enabled_p ())
 388             {
 389               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 390                                "versioning for alias not supported for: "
 391                                "bad dist vector for ");
 392               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 393                                  DR_REF (dra));
 394               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 395               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 396                                  DR_REF (drb));
 397               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 398             }
 399           return true;
 400         }
 401
 402       if (dump_enabled_p ())
 403         {
 404           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 405                            "versioning for alias required: "
 406                            "bad dist vector for ");
 407           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 408           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 409           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 410           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 411         }
 412       /* Add to list of ddrs that need to be tested at run-time.  */
 413       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 414     }
 415
 416   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 417
 418   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
 419       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
 420                                                 loop_depth, max_vf))
 421     return false;
 422
 423   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 424     {
 425       int dist = dist_v[loop_depth];
 426
 427       if (dump_enabled_p ())
 428         dump_printf_loc (MSG_NOTE, vect_location,
 429                          "dependence distance  = %d.\n", dist);
 430
 431       if (dist == 0)
 432         {
 433           if (dump_enabled_p ())
 434             {
 435               dump_printf_loc (MSG_NOTE, vect_location,
 436                                "dependence distance == 0 between ");
 437               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 438               dump_printf (MSG_NOTE, " and ");
 439               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 440               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 441             }
 442
 443           /* When we perform grouped accesses and perform implicit CSE
 444              by detecting equal accesses and doing disambiguation with
 445              runtime alias tests like for
 446                 .. = a[i];
 447                 .. = a[i+1];
 448                 a[i] = ..;
 449                 a[i+1] = ..;
 450                 *p = ..;
 451                 .. = a[i];
 452                 .. = a[i+1];
 453              where we will end up loading { a[i], a[i+1] } once, make
 454              sure that inserting group loads before the first load and
 455              stores after the last store will do the right thing.
 456              Similar for groups like
 457                 a[i] = ...;
 458                 ... = a[i];
 459                 a[i+1] = ...;
 460              where loads from the group interleave with the store.  */
 461           if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
 462             {
 463               if (dump_enabled_p ())
 464                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 465                                  "READ_WRITE dependence in interleaving.\n");
 466               return true;
 467             }
 468
 469           if (loop->safelen < 2)
 470             {
 471               tree indicator = dr_zero_step_indicator (dra);
 472               if (TREE_CODE (indicator) != INTEGER_CST)
 473                 vect_check_nonzero_value (loop_vinfo, indicator);
 474               else if (integer_zerop (indicator))
 475                 {
 476                   if (dump_enabled_p ())
 477                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 478                                  "access also has a zero step\n");
 479                   return true;
 480                 }
 481             }
 482           continue;
 483         }
 484
 485       if (dist > 0 && DDR_REVERSED_P (ddr))
 486         {
 487           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 488              reversed (to make distance vector positive), and the actual
 489              distance is negative.  */
 490           if (dump_enabled_p ())
 491             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 492                              "dependence distance negative.\n");
 493           /* Record a negative dependence distance to later limit the
 494              amount of stmt copying / unrolling we can perform.
 495              Only need to handle read-after-write dependence.  */
 496           if (DR_IS_READ (drb)
 497               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 498                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 499             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 500           continue;
 501         }
 502
 503       unsigned int abs_dist = abs (dist);
 504       if (abs_dist >= 2 && abs_dist < *max_vf)
 505         {
 506           /* The dependence distance requires reduction of the maximal
 507              vectorization factor.  */
 508           *max_vf = abs (dist);
 509           if (dump_enabled_p ())
 510             dump_printf_loc (MSG_NOTE, vect_location,
 511                              "adjusting maximal vectorization factor to %i\n",
 512                              *max_vf);
 513         }
 514
 515       if (abs_dist >= *max_vf)
 516         {
 517           /* Dependence distance does not create dependence, as far as
 518              vectorization is concerned, in this case.  */
 519           if (dump_enabled_p ())
 520             dump_printf_loc (MSG_NOTE, vect_location,
 521                              "dependence distance >= VF.\n");
 522           continue;
 523         }
 524
 525       if (dump_enabled_p ())
 526         {
 527           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 528                        "not vectorized, possible dependence "
 529                        "between data-refs ");
 530           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 531           dump_printf (MSG_NOTE,  " and ");
 532           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 533           dump_printf (MSG_NOTE,  "\n");
 534         }
 535
 536       return true;
 537     }
 538
 539   return false;
 540 }
 541
 542 /* Function vect_analyze_data_ref_dependences.
 543
 544    Examine all the data references in the loop, and make sure there do not
 545    exist any data dependences between them.  Set *MAX_VF according to
 546    the maximum vectorization factor the data dependences allow.  */
 547
 548 bool
 549 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
 550                                    unsigned int *max_vf)
 551 {
 552   unsigned int i;
 553   struct data_dependence_relation *ddr;
 554
 555   if (dump_enabled_p ())
 556     dump_printf_loc (MSG_NOTE, vect_location,
 557                      "=== vect_analyze_data_ref_dependences ===\n");
 558
 559   LOOP_VINFO_DDRS (loop_vinfo)
 560     .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 561              * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 562   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 563   /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
 564   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 565                                 &LOOP_VINFO_DDRS (loop_vinfo),
 566                                 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
 567     return false;
 568
 569   /* For epilogues we either have no aliases or alias versioning
 570      was applied to original loop.  Therefore we may just get max_vf
 571      using VF of original loop.  */
 572   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 573     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 574   else
 575     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 576       if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
 577         return false;
 578
 579   return true;
 580 }
 581
 582
 583 /* Function vect_slp_analyze_data_ref_dependence.
 584
 585    Return TRUE if there (might) exist a dependence between a memory-reference
 586    DRA and a memory-reference DRB.  When versioning for alias may check a
 587    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 588    the data dependence.  */
 589
 590 static bool
 591 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
 592 {
 593   struct data_reference *dra = DDR_A (ddr);
 594   struct data_reference *drb = DDR_B (ddr);
 595
 596   /* We need to check dependences of statements marked as unvectorizable
 597      as well, they still can prohibit vectorization.  */
 598
 599   /* Independent data accesses.  */
 600   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 601     return false;
 602
 603   if (dra == drb)
 604     return false;
 605
 606   /* Read-read is OK.  */
 607   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 608     return false;
 609
 610   /* If dra and drb are part of the same interleaving chain consider
 611      them independent.  */
 612   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
 613       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
 614           == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
 615     return false;
 616
 617   /* Unknown data dependence.  */
 618   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 619     {
 620       if  (dump_enabled_p ())
 621         {
 622           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 623                            "can't determine dependence between ");
 624           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 625           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 626           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 627           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 628         }
 629     }
 630   else if (dump_enabled_p ())
 631     {
 632       dump_printf_loc (MSG_NOTE, vect_location,
 633                        "determined dependence between ");
 634       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 635       dump_printf (MSG_NOTE, " and ");
 636       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 637       dump_printf (MSG_NOTE,  "\n");
 638     }
 639
 640   return true;
 641 }
 642
 643
 644 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 645    contain the vector of scalar stores of this instance if we are
 646    disambiguating the loads.  */
 647
 648 static bool
 649 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
 650                                    vec<gimple *> stores, gimple *last_store)
 651 {
 652   /* This walks over all stmts involved in the SLP load/store done
 653      in NODE verifying we can sink them up to the last stmt in the
 654      group.  */
 655   gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
 656   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 657     {
 658       gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
 659       if (access == last_access)
 660         continue;
 661       data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
 662       for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
 663            gsi_stmt (gsi) != last_access; gsi_next (&gsi))
 664         {
 665           gimple *stmt = gsi_stmt (gsi);
 666           if (! gimple_vuse (stmt)
 667               || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
 668             continue;
 669
 670           /* If we couldn't record a (single) data reference for this
 671              stmt we have to give up.  */
 672           /* ???  Here and below if dependence analysis fails we can resort
 673              to the alias oracle which can handle more kinds of stmts.  */
 674           data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
 675           if (!dr_b)
 676             return false;
 677
 678           bool dependent = false;
 679           /* If we run into a store of this same instance (we've just
 680              marked those) then delay dependence checking until we run
 681              into the last store because this is where it will have
 682              been sunk to (and we verify if we can do that as well).  */
 683           if (gimple_visited_p (stmt))
 684             {
 685               if (stmt != last_store)
 686                 continue;
 687               unsigned i;
 688               gimple *store;
 689               FOR_EACH_VEC_ELT (stores, i, store)
 690                 {
 691                   data_reference *store_dr
 692                     = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
 693                   ddr_p ddr = initialize_data_dependence_relation
 694                                 (dr_a, store_dr, vNULL);
 695                   dependent = vect_slp_analyze_data_ref_dependence (ddr);
 696                   free_dependence_relation (ddr);
 697                   if (dependent)
 698                     break;
 699                 }
 700             }
 701           else
 702             {
 703               ddr_p ddr = initialize_data_dependence_relation (dr_a,
 704                                                                dr_b, vNULL);
 705               dependent = vect_slp_analyze_data_ref_dependence (ddr);
 706               free_dependence_relation (ddr);
 707             }
 708           if (dependent)
 709             return false;
 710         }
 711     }
 712   return true;
 713 }
 714
 715
 716 /* Function vect_analyze_data_ref_dependences.
 717
 718    Examine all the data references in the basic-block, and make sure there
 719    do not exist any data dependences between them.  Set *MAX_VF according to
 720    the maximum vectorization factor the data dependences allow.  */
 721
 722 bool
 723 vect_slp_analyze_instance_dependence (slp_instance instance)
 724 {
 725   if (dump_enabled_p ())
 726     dump_printf_loc (MSG_NOTE, vect_location,
 727                      "=== vect_slp_analyze_instance_dependence ===\n");
 728
 729   /* The stores of this instance are at the root of the SLP tree.  */
 730   slp_tree store = SLP_INSTANCE_TREE (instance);
 731   if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
 732     store = NULL;
 733
 734   /* Verify we can sink stores to the vectorized stmt insert location.  */
 735   gimple *last_store = NULL;
 736   if (store)
 737     {
 738       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
 739         return false;
 740
 741       /* Mark stores in this instance and remember the last one.  */
 742       last_store = vect_find_last_scalar_stmt_in_slp (store);
 743       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 744         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
 745     }
 746
 747   bool res = true;
 748
 749   /* Verify we can sink loads to the vectorized stmt insert location,
 750      special-casing stores of this instance.  */
 751   slp_tree load;
 752   unsigned int i;
 753   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
 754     if (! vect_slp_analyze_node_dependences (instance, load,
 755                                              store
 756                                              ? SLP_TREE_SCALAR_STMTS (store)
 757                                              : vNULL, last_store))
 758       {
 759         res = false;
 760         break;
 761       }
 762
 763   /* Unset the visited flag.  */
 764   if (store)
 765     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 766       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
 767
 768   return res;
 769 }
 770
 771 /* Record in VINFO the base alignment guarantee given by DRB.  STMT is
 772    the statement that contains DRB, which is useful for recording in the
 773    dump file.  */
 774
 775 static void
 776 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
 777                             innermost_loop_behavior *drb)
 778 {
 779   bool existed;
 780   innermost_loop_behavior *&entry
 781     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
 782   if (!existed || entry->base_alignment < drb->base_alignment)
 783     {
 784       entry = drb;
 785       if (dump_enabled_p ())
 786         {
 787           dump_printf_loc (MSG_NOTE, vect_location,
 788                            "recording new base alignment for ");
 789           dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
 790           dump_printf (MSG_NOTE, "\n");
 791           dump_printf_loc (MSG_NOTE, vect_location,
 792                            "  alignment:    %d\n", drb->base_alignment);
 793           dump_printf_loc (MSG_NOTE, vect_location,
 794                            "  misalignment: %d\n", drb->base_misalignment);
 795           dump_printf_loc (MSG_NOTE, vect_location,
 796                            "  based on:     ");
 797           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 798         }
 799     }
 800 }
 801
 802 /* If the region we're going to vectorize is reached, all unconditional
 803    data references occur at least once.  We can therefore pool the base
 804    alignment guarantees from each unconditional reference.  Do this by
 805    going through all the data references in VINFO and checking whether
 806    the containing statement makes the reference unconditionally.  If so,
 807    record the alignment of the base address in VINFO so that it can be
 808    used for all other references with the same base.  */
 809
 810 void
 811 vect_record_base_alignments (vec_info *vinfo)
 812 {
 813   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
 814   struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
 815   data_reference *dr;
 816   unsigned int i;
 817   FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
 818     if (!DR_IS_CONDITIONAL_IN_STMT (dr))
 819       {
 820         gimple *stmt = DR_STMT (dr);
 821         vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
 822
 823         /* If DR is nested in the loop that is being vectorized, we can also
 824            record the alignment of the base wrt the outer loop.  */
 825         if (loop && nested_in_vect_loop_p (loop, stmt))
 826           {
 827             stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 828             vect_record_base_alignment
 829               (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
 830           }
 831       }
 832 }
 833
 834 /* Return the target alignment for the vectorized form of DR.  */
 835
 836 static unsigned int
 837 vect_calculate_target_alignment (struct data_reference *dr)
 838 {
 839   gimple *stmt = DR_STMT (dr);
 840   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 841   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 842   return targetm.vectorize.preferred_vector_alignment (vectype);
 843 }
 844
 845 /* Function vect_compute_data_ref_alignment
 846
 847    Compute the misalignment of the data reference DR.
 848
 849    Output:
 850    1. If during the misalignment computation it is found that the data reference
 851       cannot be vectorized then false is returned.
 852    2. DR_MISALIGNMENT (DR) is defined.
 853
 854    FOR NOW: No analysis is actually performed. Misalignment is calculated
 855    only for trivial cases. TODO.  */
 856
 857 bool
 858 vect_compute_data_ref_alignment (struct data_reference *dr)
 859 {
 860   gimple *stmt = DR_STMT (dr);
 861   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 862   vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
 863   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 864   struct loop *loop = NULL;
 865   tree ref = DR_REF (dr);
 866   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 867
 868   if (dump_enabled_p ())
 869     dump_printf_loc (MSG_NOTE, vect_location,
 870                      "vect_compute_data_ref_alignment:\n");
 871
 872   if (loop_vinfo)
 873     loop = LOOP_VINFO_LOOP (loop_vinfo);
 874
 875   /* Initialize misalignment to unknown.  */
 876   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
 877
 878   innermost_loop_behavior *drb = vect_dr_behavior (dr);
 879   bool step_preserves_misalignment_p;
 880
 881   unsigned HOST_WIDE_INT vector_alignment
 882     = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
 883   DR_TARGET_ALIGNMENT (dr) = vector_alignment;
 884
 885   /* No step for BB vectorization.  */
 886   if (!loop)
 887     {
 888       gcc_assert (integer_zerop (drb->step));
 889       step_preserves_misalignment_p = true;
 890     }
 891
 892   /* In case the dataref is in an inner-loop of the loop that is being
 893      vectorized (LOOP), we use the base and misalignment information
 894      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 895      stays the same throughout the execution of the inner-loop, which is why
 896      we have to check that the stride of the dataref in the inner-loop evenly
 897      divides by the vector alignment.  */
 898   else if (nested_in_vect_loop_p (loop, stmt))
 899     {
 900       step_preserves_misalignment_p
 901         = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
 902
 903       if (dump_enabled_p ())
 904         {
 905           if (step_preserves_misalignment_p)
 906             dump_printf_loc (MSG_NOTE, vect_location,
 907                              "inner step divides the vector alignment.\n");
 908           else
 909             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 910                              "inner step doesn't divide the vector"
 911                              " alignment.\n");
 912         }
 913     }
 914
 915   /* Similarly we can only use base and misalignment information relative to
 916      an innermost loop if the misalignment stays the same throughout the
 917      execution of the loop.  As above, this is the case if the stride of
 918      the dataref evenly divides by the alignment.  */
 919   else
 920     {
 921       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 922       step_preserves_misalignment_p
 923         = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
 924
 925       if (!step_preserves_misalignment_p && dump_enabled_p ())
 926         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 927                          "step doesn't divide the vector alignment.\n");
 928     }
 929
 930   unsigned int base_alignment = drb->base_alignment;
 931   unsigned int base_misalignment = drb->base_misalignment;
 932
 933   /* Calculate the maximum of the pooled base address alignment and the
 934      alignment that we can compute for DR itself.  */
 935   innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
 936   if (entry && base_alignment < (*entry)->base_alignment)
 937     {
 938       base_alignment = (*entry)->base_alignment;
 939       base_misalignment = (*entry)->base_misalignment;
 940     }
 941
 942   if (drb->offset_alignment < vector_alignment
 943       || !step_preserves_misalignment_p
 944       /* We need to know whether the step wrt the vectorized loop is
 945          negative when computing the starting misalignment below.  */
 946       || TREE_CODE (drb->step) != INTEGER_CST)
 947     {
 948       if (dump_enabled_p ())
 949         {
 950           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 951                            "Unknown alignment for access: ");
 952           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 953           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 954         }
 955       return true;
 956     }
 957
 958   if (base_alignment < vector_alignment)
 959     {
 960       tree base = drb->base_address;
 961       if (TREE_CODE (base) == ADDR_EXPR)
 962         base = TREE_OPERAND (base, 0);
 963       if (!vect_can_force_dr_alignment_p (base,
 964                                           vector_alignment * BITS_PER_UNIT))
 965         {
 966           if (dump_enabled_p ())
 967             {
 968               dump_printf_loc (MSG_NOTE, vect_location,
 969                                "can't force alignment of ref: ");
 970               dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 971               dump_printf (MSG_NOTE, "\n");
 972             }
 973           return true;
 974         }
 975
 976       /* Force the alignment of the decl.
 977          NOTE: This is the only change to the code we make during
 978          the analysis phase, before deciding to vectorize the loop.  */
 979       if (dump_enabled_p ())
 980         {
 981           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
 982           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 983           dump_printf (MSG_NOTE, "\n");
 984         }
 985
 986       DR_VECT_AUX (dr)->base_decl = base;
 987       DR_VECT_AUX (dr)->base_misaligned = true;
 988       base_misalignment = 0;
 989     }
 990   poly_int64 misalignment
 991     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
 992
 993   /* If this is a backward running DR then first access in the larger
 994      vectype actually is N-1 elements before the address in the DR.
 995      Adjust misalign accordingly.  */
 996   if (tree_int_cst_sgn (drb->step) < 0)
 997     /* PLUS because STEP is negative.  */
 998     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
 999                      * TREE_INT_CST_LOW (drb->step));
1000
1001   unsigned int const_misalignment;
1002   if (!known_misalignment (misalignment, vector_alignment,
1003                            &const_misalignment))
1004     {
1005       if (dump_enabled_p ())
1006         {
1007           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1008                            "Non-constant misalignment for access: ");
1009           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1010           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1011         }
1012       return true;
1013     }
1014
1015   SET_DR_MISALIGNMENT (dr, const_misalignment);
1016
1017   if (dump_enabled_p ())
1018     {
1019       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1020                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1021       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1022       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1023     }
1024
1025   return true;
1026 }
1027
1028 /* Function vect_update_misalignment_for_peel.
1029    Sets DR's misalignment
1030    - to 0 if it has the same alignment as DR_PEEL,
1031    - to the misalignment computed using NPEEL if DR's salignment is known,
1032    - to -1 (unknown) otherwise.
1033
1034    DR - the data reference whose misalignment is to be adjusted.
1035    DR_PEEL - the data reference whose misalignment is being made
1036              zero in the vector loop by the peel.
1037    NPEEL - the number of iterations in the peel loop if the misalignment
1038            of DR_PEEL is known at compile time.  */
1039
1040 static void
1041 vect_update_misalignment_for_peel (struct data_reference *dr,
1042                                    struct data_reference *dr_peel, int npeel)
1043 {
1044   unsigned int i;
1045   vec<dr_p> same_aligned_drs;
1046   struct data_reference *current_dr;
1047   int dr_size = vect_get_scalar_dr_size (dr);
1048   int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1049   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1050   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1051
1052  /* For interleaved data accesses the step in the loop must be multiplied by
1053      the size of the interleaving group.  */
1054   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1055     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1056   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1057     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1058
1059   /* It can be assumed that the data refs with the same alignment as dr_peel
1060      are aligned in the vector loop.  */
1061   same_aligned_drs
1062     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1063   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1064     {
1065       if (current_dr != dr)
1066         continue;
1067       gcc_assert (!known_alignment_for_access_p (dr)
1068                   || !known_alignment_for_access_p (dr_peel)
1069                   || (DR_MISALIGNMENT (dr) / dr_size
1070                       == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1071       SET_DR_MISALIGNMENT (dr, 0);
1072       return;
1073     }
1074
1075   if (known_alignment_for_access_p (dr)
1076       && known_alignment_for_access_p (dr_peel))
1077     {
1078       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1079       int misal = DR_MISALIGNMENT (dr);
1080       misal += negative ? -npeel * dr_size : npeel * dr_size;
1081       misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1082       SET_DR_MISALIGNMENT (dr, misal);
1083       return;
1084     }
1085
1086   if (dump_enabled_p ())
1087     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1088                      "to unknown (-1).\n");
1089   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1090 }
1091
1092
1093 /* Function verify_data_ref_alignment
1094
1095    Return TRUE if DR can be handled with respect to alignment.  */
1096
1097 static bool
1098 verify_data_ref_alignment (data_reference_p dr)
1099 {
1100   enum dr_alignment_support supportable_dr_alignment
1101     = vect_supportable_dr_alignment (dr, false);
1102   if (!supportable_dr_alignment)
1103     {
1104       if (dump_enabled_p ())
1105         {
1106           if (DR_IS_READ (dr))
1107             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1108                              "not vectorized: unsupported unaligned load.");
1109           else
1110             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1111                              "not vectorized: unsupported unaligned "
1112                              "store.");
1113
1114           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1115                              DR_REF (dr));
1116           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1117         }
1118       return false;
1119     }
1120
1121   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1122     dump_printf_loc (MSG_NOTE, vect_location,
1123                      "Vectorizing an unaligned access.\n");
1124
1125   return true;
1126 }
1127
1128 /* Function vect_verify_datarefs_alignment
1129
1130    Return TRUE if all data references in the loop can be
1131    handled with respect to alignment.  */
1132
1133 bool
1134 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1135 {
1136   vec<data_reference_p> datarefs = vinfo->datarefs;
1137   struct data_reference *dr;
1138   unsigned int i;
1139
1140   FOR_EACH_VEC_ELT (datarefs, i, dr)
1141     {
1142       gimple *stmt = DR_STMT (dr);
1143       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1144
1145       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1146         continue;
1147
1148       /* For interleaving, only the alignment of the first access matters.   */
1149       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1150           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1151         continue;
1152
1153       /* Strided accesses perform only component accesses, alignment is
1154          irrelevant for them.  */
1155       if (STMT_VINFO_STRIDED_P (stmt_info)
1156           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1157         continue;
1158
1159       if (! verify_data_ref_alignment (dr))
1160         return false;
1161     }
1162
1163   return true;
1164 }
1165
1166 /* Given an memory reference EXP return whether its alignment is less
1167    than its size.  */
1168
1169 static bool
1170 not_size_aligned (tree exp)
1171 {
1172   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1173     return true;
1174
1175   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1176           > get_object_alignment (exp));
1177 }
1178
1179 /* Function vector_alignment_reachable_p
1180
1181    Return true if vector alignment for DR is reachable by peeling
1182    a few loop iterations.  Return false otherwise.  */
1183
1184 static bool
1185 vector_alignment_reachable_p (struct data_reference *dr)
1186 {
1187   gimple *stmt = DR_STMT (dr);
1188   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1189   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1190
1191   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1192     {
1193       /* For interleaved access we peel only if number of iterations in
1194          the prolog loop ({VF - misalignment}), is a multiple of the
1195          number of the interleaved accesses.  */
1196       int elem_size, mis_in_elements;
1197
1198       /* FORNOW: handle only known alignment.  */
1199       if (!known_alignment_for_access_p (dr))
1200         return false;
1201
1202       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1203       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1204       elem_size = vector_element_size (vector_size, nelements);
1205       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1206
1207       if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
1208         return false;
1209     }
1210
1211   /* If misalignment is known at the compile time then allow peeling
1212      only if natural alignment is reachable through peeling.  */
1213   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1214     {
1215       HOST_WIDE_INT elmsize =
1216                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1217       if (dump_enabled_p ())
1218         {
1219           dump_printf_loc (MSG_NOTE, vect_location,
1220                            "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1221           dump_printf (MSG_NOTE,
1222                        ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1223         }
1224       if (DR_MISALIGNMENT (dr) % elmsize)
1225         {
1226           if (dump_enabled_p ())
1227             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1228                              "data size does not divide the misalignment.\n");
1229           return false;
1230         }
1231     }
1232
1233   if (!known_alignment_for_access_p (dr))
1234     {
1235       tree type = TREE_TYPE (DR_REF (dr));
1236       bool is_packed = not_size_aligned (DR_REF (dr));
1237       if (dump_enabled_p ())
1238         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1239                          "Unknown misalignment, %snaturally aligned\n",
1240                          is_packed ? "not " : "");
1241       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1242     }
1243
1244   return true;
1245 }
1246
1247
1248 /* Calculate the cost of the memory access represented by DR.  */
1249
1250 static void
1251 vect_get_data_access_cost (struct data_reference *dr,
1252                            unsigned int *inside_cost,
1253                            unsigned int *outside_cost,
1254                            stmt_vector_for_cost *body_cost_vec)
1255 {
1256   gimple *stmt = DR_STMT (dr);
1257   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1258   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1259   int ncopies;
1260
1261   if (PURE_SLP_STMT (stmt_info))
1262     ncopies = 1;
1263   else
1264     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1265
1266   if (DR_IS_READ (dr))
1267     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1268                         NULL, body_cost_vec, false);
1269   else
1270     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1271
1272   if (dump_enabled_p ())
1273     dump_printf_loc (MSG_NOTE, vect_location,
1274                      "vect_get_data_access_cost: inside_cost = %d, "
1275                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1276 }
1277
1278
1279 typedef struct _vect_peel_info
1280 {
1281   struct data_reference *dr;
1282   int npeel;
1283   unsigned int count;
1284 } *vect_peel_info;
1285
1286 typedef struct _vect_peel_extended_info
1287 {
1288   struct _vect_peel_info peel_info;
1289   unsigned int inside_cost;
1290   unsigned int outside_cost;
1291 } *vect_peel_extended_info;
1292
1293
1294 /* Peeling hashtable helpers.  */
1295
1296 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1297 {
1298   static inline hashval_t hash (const _vect_peel_info *);
1299   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1300 };
1301
1302 inline hashval_t
1303 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1304 {
1305   return (hashval_t) peel_info->npeel;
1306 }
1307
1308 inline bool
1309 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1310 {
1311   return (a->npeel == b->npeel);
1312 }
1313
1314
1315 /* Insert DR into peeling hash table with NPEEL as key.  */
1316
1317 static void
1318 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1319                           loop_vec_info loop_vinfo, struct data_reference *dr,
1320                           int npeel)
1321 {
1322   struct _vect_peel_info elem, *slot;
1323   _vect_peel_info **new_slot;
1324   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1325
1326   elem.npeel = npeel;
1327   slot = peeling_htab->find (&elem);
1328   if (slot)
1329     slot->count++;
1330   else
1331     {
1332       slot = XNEW (struct _vect_peel_info);
1333       slot->npeel = npeel;
1334       slot->dr = dr;
1335       slot->count = 1;
1336       new_slot = peeling_htab->find_slot (slot, INSERT);
1337       *new_slot = slot;
1338     }
1339
1340   if (!supportable_dr_alignment
1341       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1342     slot->count += VECT_MAX_COST;
1343 }
1344
1345
1346 /* Traverse peeling hash table to find peeling option that aligns maximum
1347    number of data accesses.  */
1348
1349 int
1350 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1351                                      _vect_peel_extended_info *max)
1352 {
1353   vect_peel_info elem = *slot;
1354
1355   if (elem->count > max->peel_info.count
1356       || (elem->count == max->peel_info.count
1357           && max->peel_info.npeel > elem->npeel))
1358     {
1359       max->peel_info.npeel = elem->npeel;
1360       max->peel_info.count = elem->count;
1361       max->peel_info.dr = elem->dr;
1362     }
1363
1364   return 1;
1365 }
1366
1367 /* Get the costs of peeling NPEEL iterations checking data access costs
1368    for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1369    misalignment will be zero after peeling.  */
1370
1371 static void
1372 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1373                                 struct data_reference *dr0,
1374                                 unsigned int *inside_cost,
1375                                 unsigned int *outside_cost,
1376                                 stmt_vector_for_cost *body_cost_vec,
1377                                 unsigned int npeel,
1378                                 bool unknown_misalignment)
1379 {
1380   unsigned i;
1381   data_reference *dr;
1382
1383   FOR_EACH_VEC_ELT (datarefs, i, dr)
1384     {
1385       gimple *stmt = DR_STMT (dr);
1386       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1387       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1388         continue;
1389
1390       /* For interleaving, only the alignment of the first access
1391          matters.  */
1392       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1393           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1394         continue;
1395
1396       /* Strided accesses perform only component accesses, alignment is
1397          irrelevant for them.  */
1398       if (STMT_VINFO_STRIDED_P (stmt_info)
1399           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1400         continue;
1401
1402       int save_misalignment;
1403       save_misalignment = DR_MISALIGNMENT (dr);
1404       if (npeel == 0)
1405         ;
1406       else if (unknown_misalignment && dr == dr0)
1407         SET_DR_MISALIGNMENT (dr, 0);
1408       else
1409         vect_update_misalignment_for_peel (dr, dr0, npeel);
1410       vect_get_data_access_cost (dr, inside_cost, outside_cost,
1411                                  body_cost_vec);
1412       SET_DR_MISALIGNMENT (dr, save_misalignment);
1413     }
1414 }
1415
1416 /* Traverse peeling hash table and calculate cost for each peeling option.
1417    Find the one with the lowest cost.  */
1418
1419 int
1420 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1421                                    _vect_peel_extended_info *min)
1422 {
1423   vect_peel_info elem = *slot;
1424   int dummy;
1425   unsigned int inside_cost = 0, outside_cost = 0;
1426   gimple *stmt = DR_STMT (elem->dr);
1427   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1428   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1429   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1430                        epilogue_cost_vec;
1431
1432   prologue_cost_vec.create (2);
1433   body_cost_vec.create (2);
1434   epilogue_cost_vec.create (2);
1435
1436   vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1437                                   elem->dr, &inside_cost, &outside_cost,
1438                                   &body_cost_vec, elem->npeel, false);
1439
1440   body_cost_vec.release ();
1441
1442   outside_cost += vect_get_known_peeling_cost
1443     (loop_vinfo, elem->npeel, &dummy,
1444      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1445      &prologue_cost_vec, &epilogue_cost_vec);
1446
1447   /* Prologue and epilogue costs are added to the target model later.
1448      These costs depend only on the scalar iteration cost, the
1449      number of peeling iterations finally chosen, and the number of
1450      misaligned statements.  So discard the information found here.  */
1451   prologue_cost_vec.release ();
1452   epilogue_cost_vec.release ();
1453
1454   if (inside_cost < min->inside_cost
1455       || (inside_cost == min->inside_cost
1456           && outside_cost < min->outside_cost))
1457     {
1458       min->inside_cost = inside_cost;
1459       min->outside_cost = outside_cost;
1460       min->peel_info.dr = elem->dr;
1461       min->peel_info.npeel = elem->npeel;
1462       min->peel_info.count = elem->count;
1463     }
1464
1465   return 1;
1466 }
1467
1468
1469 /* Choose best peeling option by traversing peeling hash table and either
1470    choosing an option with the lowest cost (if cost model is enabled) or the
1471    option that aligns as many accesses as possible.  */
1472
1473 static struct _vect_peel_extended_info
1474 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1475                                        loop_vec_info loop_vinfo)
1476 {
1477    struct _vect_peel_extended_info res;
1478
1479    res.peel_info.dr = NULL;
1480
1481    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1482      {
1483        res.inside_cost = INT_MAX;
1484        res.outside_cost = INT_MAX;
1485        peeling_htab->traverse <_vect_peel_extended_info *,
1486                                vect_peeling_hash_get_lowest_cost> (&res);
1487      }
1488    else
1489      {
1490        res.peel_info.count = 0;
1491        peeling_htab->traverse <_vect_peel_extended_info *,
1492                                vect_peeling_hash_get_most_frequent> (&res);
1493        res.inside_cost = 0;
1494        res.outside_cost = 0;
1495      }
1496
1497    return res;
1498 }
1499
1500 /* Return true if the new peeling NPEEL is supported.  */
1501
1502 static bool
1503 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1504                           unsigned npeel)
1505 {
1506   unsigned i;
1507   struct data_reference *dr = NULL;
1508   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1509   gimple *stmt;
1510   stmt_vec_info stmt_info;
1511   enum dr_alignment_support supportable_dr_alignment;
1512
1513   /* Ensure that all data refs can be vectorized after the peel.  */
1514   FOR_EACH_VEC_ELT (datarefs, i, dr)
1515     {
1516       int save_misalignment;
1517
1518       if (dr == dr0)
1519         continue;
1520
1521       stmt = DR_STMT (dr);
1522       stmt_info = vinfo_for_stmt (stmt);
1523       /* For interleaving, only the alignment of the first access
1524          matters.  */
1525       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1526           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1527         continue;
1528
1529       /* Strided accesses perform only component accesses, alignment is
1530          irrelevant for them.  */
1531       if (STMT_VINFO_STRIDED_P (stmt_info)
1532           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1533         continue;
1534
1535       save_misalignment = DR_MISALIGNMENT (dr);
1536       vect_update_misalignment_for_peel (dr, dr0, npeel);
1537       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1538       SET_DR_MISALIGNMENT (dr, save_misalignment);
1539
1540       if (!supportable_dr_alignment)
1541         return false;
1542     }
1543
1544   return true;
1545 }
1546
1547 /* Function vect_enhance_data_refs_alignment
1548
1549    This pass will use loop versioning and loop peeling in order to enhance
1550    the alignment of data references in the loop.
1551
1552    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1553    original loop is to be vectorized.  Any other loops that are created by
1554    the transformations performed in this pass - are not supposed to be
1555    vectorized.  This restriction will be relaxed.
1556
1557    This pass will require a cost model to guide it whether to apply peeling
1558    or versioning or a combination of the two.  For example, the scheme that
1559    intel uses when given a loop with several memory accesses, is as follows:
1560    choose one memory access ('p') which alignment you want to force by doing
1561    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1562    other accesses are not necessarily aligned, or (2) use loop versioning to
1563    generate one loop in which all accesses are aligned, and another loop in
1564    which only 'p' is necessarily aligned.
1565
1566    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1567    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1568    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1569
1570    Devising a cost model is the most critical aspect of this work.  It will
1571    guide us on which access to peel for, whether to use loop versioning, how
1572    many versions to create, etc.  The cost model will probably consist of
1573    generic considerations as well as target specific considerations (on
1574    powerpc for example, misaligned stores are more painful than misaligned
1575    loads).
1576
1577    Here are the general steps involved in alignment enhancements:
1578
1579      -- original loop, before alignment analysis:
1580         for (i=0; i<N; i++){
1581           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1582           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1583         }
1584
1585      -- After vect_compute_data_refs_alignment:
1586         for (i=0; i<N; i++){
1587           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1588           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1589         }
1590
1591      -- Possibility 1: we do loop versioning:
1592      if (p is aligned) {
1593         for (i=0; i<N; i++){    # loop 1A
1594           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1595           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1596         }
1597      }
1598      else {
1599         for (i=0; i<N; i++){    # loop 1B
1600           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1601           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1602         }
1603      }
1604
1605      -- Possibility 2: we do loop peeling:
1606      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1607         x = q[i];
1608         p[i] = y;
1609      }
1610      for (i = 3; i < N; i++){   # loop 2A
1611         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1612         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1613      }
1614
1615      -- Possibility 3: combination of loop peeling and versioning:
1616      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1617         x = q[i];
1618         p[i] = y;
1619      }
1620      if (p is aligned) {
1621         for (i = 3; i<N; i++){  # loop 3A
1622           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1623           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1624         }
1625      }
1626      else {
1627         for (i = 3; i<N; i++){  # loop 3B
1628           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1629           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1630         }
1631      }
1632
1633      These loops are later passed to loop_transform to be vectorized.  The
1634      vectorizer will use the alignment information to guide the transformation
1635      (whether to generate regular loads/stores, or with special handling for
1636      misalignment).  */
1637
1638 bool
1639 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1640 {
1641   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1642   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1643   enum dr_alignment_support supportable_dr_alignment;
1644   struct data_reference *dr0 = NULL, *first_store = NULL;
1645   struct data_reference *dr;
1646   unsigned int i, j;
1647   bool do_peeling = false;
1648   bool do_versioning = false;
1649   bool stat;
1650   gimple *stmt;
1651   stmt_vec_info stmt_info;
1652   unsigned int npeel = 0;
1653   bool one_misalignment_known = false;
1654   bool one_misalignment_unknown = false;
1655   bool one_dr_unsupportable = false;
1656   struct data_reference *unsupportable_dr = NULL;
1657   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1658   unsigned possible_npeel_number = 1;
1659   tree vectype;
1660   unsigned int mis, same_align_drs_max = 0;
1661   hash_table<peel_info_hasher> peeling_htab (1);
1662
1663   if (dump_enabled_p ())
1664     dump_printf_loc (MSG_NOTE, vect_location,
1665                      "=== vect_enhance_data_refs_alignment ===\n");
1666
1667   /* Reset data so we can safely be called multiple times.  */
1668   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1669   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1670
1671   /* While cost model enhancements are expected in the future, the high level
1672      view of the code at this time is as follows:
1673
1674      A) If there is a misaligned access then see if peeling to align
1675         this access can make all data references satisfy
1676         vect_supportable_dr_alignment.  If so, update data structures
1677         as needed and return true.
1678
1679      B) If peeling wasn't possible and there is a data reference with an
1680         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1681         then see if loop versioning checks can be used to make all data
1682         references satisfy vect_supportable_dr_alignment.  If so, update
1683         data structures as needed and return true.
1684
1685      C) If neither peeling nor versioning were successful then return false if
1686         any data reference does not satisfy vect_supportable_dr_alignment.
1687
1688      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1689
1690      Note, Possibility 3 above (which is peeling and versioning together) is not
1691      being done at this time.  */
1692
1693   /* (1) Peeling to force alignment.  */
1694
1695   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1696      Considerations:
1697      + How many accesses will become aligned due to the peeling
1698      - How many accesses will become unaligned due to the peeling,
1699        and the cost of misaligned accesses.
1700      - The cost of peeling (the extra runtime checks, the increase
1701        in code size).  */
1702
1703   FOR_EACH_VEC_ELT (datarefs, i, dr)
1704     {
1705       stmt = DR_STMT (dr);
1706       stmt_info = vinfo_for_stmt (stmt);
1707
1708       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1709         continue;
1710
1711       /* For interleaving, only the alignment of the first access
1712          matters.  */
1713       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1714           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1715         continue;
1716
1717       /* For invariant accesses there is nothing to enhance.  */
1718       if (integer_zerop (DR_STEP (dr)))
1719         continue;
1720
1721       /* Strided accesses perform only component accesses, alignment is
1722          irrelevant for them.  */
1723       if (STMT_VINFO_STRIDED_P (stmt_info)
1724           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1725         continue;
1726
1727       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1728       do_peeling = vector_alignment_reachable_p (dr);
1729       if (do_peeling)
1730         {
1731           if (known_alignment_for_access_p (dr))
1732             {
1733               unsigned int npeel_tmp = 0;
1734               bool negative = tree_int_cst_compare (DR_STEP (dr),
1735                                                     size_zero_node) < 0;
1736
1737               vectype = STMT_VINFO_VECTYPE (stmt_info);
1738               unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1739               unsigned int dr_size = vect_get_scalar_dr_size (dr);
1740               mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1741               if (DR_MISALIGNMENT (dr) != 0)
1742                 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1743
1744               /* For multiple types, it is possible that the bigger type access
1745                  will have more than one peeling option.  E.g., a loop with two
1746                  types: one of size (vector size / 4), and the other one of
1747                  size (vector size / 8).  Vectorization factor will 8.  If both
1748                  accesses are misaligned by 3, the first one needs one scalar
1749                  iteration to be aligned, and the second one needs 5.  But the
1750                  first one will be aligned also by peeling 5 scalar
1751                  iterations, and in that case both accesses will be aligned.
1752                  Hence, except for the immediate peeling amount, we also want
1753                  to try to add full vector size, while we don't exceed
1754                  vectorization factor.
1755                  We do this automatically for cost model, since we calculate
1756                  cost for every peeling option.  */
1757               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1758                 {
1759                   poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1760                                           ? vf * GROUP_SIZE (stmt_info) : vf);
1761                   possible_npeel_number
1762                     = vect_get_num_vectors (nscalars, vectype);
1763
1764                   /* NPEEL_TMP is 0 when there is no misalignment, but also
1765                      allow peeling NELEMENTS.  */
1766                   if (DR_MISALIGNMENT (dr) == 0)
1767                     possible_npeel_number++;
1768                 }
1769
1770               /* Save info about DR in the hash table.  Also include peeling
1771                  amounts according to the explanation above.  */
1772               for (j = 0; j < possible_npeel_number; j++)
1773                 {
1774                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1775                                             dr, npeel_tmp);
1776                   npeel_tmp += target_align / dr_size;
1777                 }
1778
1779               one_misalignment_known = true;
1780             }
1781           else
1782             {
1783               /* If we don't know any misalignment values, we prefer
1784                  peeling for data-ref that has the maximum number of data-refs
1785                  with the same alignment, unless the target prefers to align
1786                  stores over load.  */
1787               unsigned same_align_drs
1788                 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1789               if (!dr0
1790                   || same_align_drs_max < same_align_drs)
1791                 {
1792                   same_align_drs_max = same_align_drs;
1793                   dr0 = dr;
1794                 }
1795               /* For data-refs with the same number of related
1796                  accesses prefer the one where the misalign
1797                  computation will be invariant in the outermost loop.  */
1798               else if (same_align_drs_max == same_align_drs)
1799                 {
1800                   struct loop *ivloop0, *ivloop;
1801                   ivloop0 = outermost_invariant_loop_for_expr
1802                     (loop, DR_BASE_ADDRESS (dr0));
1803                   ivloop = outermost_invariant_loop_for_expr
1804                     (loop, DR_BASE_ADDRESS (dr));
1805                   if ((ivloop && !ivloop0)
1806                       || (ivloop && ivloop0
1807                           && flow_loop_nested_p (ivloop, ivloop0)))
1808                     dr0 = dr;
1809                 }
1810
1811               one_misalignment_unknown = true;
1812
1813               /* Check for data refs with unsupportable alignment that
1814                  can be peeled.  */
1815               if (!supportable_dr_alignment)
1816               {
1817                 one_dr_unsupportable = true;
1818                 unsupportable_dr = dr;
1819               }
1820
1821               if (!first_store && DR_IS_WRITE (dr))
1822                 first_store = dr;
1823             }
1824         }
1825       else
1826         {
1827           if (!aligned_access_p (dr))
1828             {
1829               if (dump_enabled_p ())
1830                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1831                                  "vector alignment may not be reachable\n");
1832               break;
1833             }
1834         }
1835     }
1836
1837   /* Check if we can possibly peel the loop.  */
1838   if (!vect_can_advance_ivs_p (loop_vinfo)
1839       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1840       || loop->inner)
1841     do_peeling = false;
1842
1843   struct _vect_peel_extended_info peel_for_known_alignment;
1844   struct _vect_peel_extended_info peel_for_unknown_alignment;
1845   struct _vect_peel_extended_info best_peel;
1846
1847   peel_for_unknown_alignment.inside_cost = INT_MAX;
1848   peel_for_unknown_alignment.outside_cost = INT_MAX;
1849   peel_for_unknown_alignment.peel_info.count = 0;
1850
1851   if (do_peeling
1852       && one_misalignment_unknown)
1853     {
1854       /* Check if the target requires to prefer stores over loads, i.e., if
1855          misaligned stores are more expensive than misaligned loads (taking
1856          drs with same alignment into account).  */
1857       unsigned int load_inside_cost = 0;
1858       unsigned int load_outside_cost = 0;
1859       unsigned int store_inside_cost = 0;
1860       unsigned int store_outside_cost = 0;
1861       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1862
1863       stmt_vector_for_cost dummy;
1864       dummy.create (2);
1865       vect_get_peeling_costs_all_drs (datarefs, dr0,
1866                                       &load_inside_cost,
1867                                       &load_outside_cost,
1868                                       &dummy, estimated_npeels, true);
1869       dummy.release ();
1870
1871       if (first_store)
1872         {
1873           dummy.create (2);
1874           vect_get_peeling_costs_all_drs (datarefs, first_store,
1875                                           &store_inside_cost,
1876                                           &store_outside_cost,
1877                                           &dummy, estimated_npeels, true);
1878           dummy.release ();
1879         }
1880       else
1881         {
1882           store_inside_cost = INT_MAX;
1883           store_outside_cost = INT_MAX;
1884         }
1885
1886       if (load_inside_cost > store_inside_cost
1887           || (load_inside_cost == store_inside_cost
1888               && load_outside_cost > store_outside_cost))
1889         {
1890           dr0 = first_store;
1891           peel_for_unknown_alignment.inside_cost = store_inside_cost;
1892           peel_for_unknown_alignment.outside_cost = store_outside_cost;
1893         }
1894       else
1895         {
1896           peel_for_unknown_alignment.inside_cost = load_inside_cost;
1897           peel_for_unknown_alignment.outside_cost = load_outside_cost;
1898         }
1899
1900       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1901       prologue_cost_vec.create (2);
1902       epilogue_cost_vec.create (2);
1903
1904       int dummy2;
1905       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1906         (loop_vinfo, estimated_npeels, &dummy2,
1907          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1908          &prologue_cost_vec, &epilogue_cost_vec);
1909
1910       prologue_cost_vec.release ();
1911       epilogue_cost_vec.release ();
1912
1913       peel_for_unknown_alignment.peel_info.count = 1
1914         + STMT_VINFO_SAME_ALIGN_REFS
1915         (vinfo_for_stmt (DR_STMT (dr0))).length ();
1916     }
1917
1918   peel_for_unknown_alignment.peel_info.npeel = 0;
1919   peel_for_unknown_alignment.peel_info.dr = dr0;
1920
1921   best_peel = peel_for_unknown_alignment;
1922
1923   peel_for_known_alignment.inside_cost = INT_MAX;
1924   peel_for_known_alignment.outside_cost = INT_MAX;
1925   peel_for_known_alignment.peel_info.count = 0;
1926   peel_for_known_alignment.peel_info.dr = NULL;
1927
1928   if (do_peeling && one_misalignment_known)
1929     {
1930       /* Peeling is possible, but there is no data access that is not supported
1931          unless aligned.  So we try to choose the best possible peeling from
1932          the hash table.  */
1933       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1934         (&peeling_htab, loop_vinfo);
1935     }
1936
1937   /* Compare costs of peeling for known and unknown alignment. */
1938   if (peel_for_known_alignment.peel_info.dr != NULL
1939       && peel_for_unknown_alignment.inside_cost
1940       >= peel_for_known_alignment.inside_cost)
1941     {
1942       best_peel = peel_for_known_alignment;
1943
1944       /* If the best peeling for known alignment has NPEEL == 0, perform no
1945          peeling at all except if there is an unsupportable dr that we can
1946          align.  */
1947       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1948         do_peeling = false;
1949     }
1950
1951   /* If there is an unsupportable data ref, prefer this over all choices so far
1952      since we'd have to discard a chosen peeling except when it accidentally
1953      aligned the unsupportable data ref.  */
1954   if (one_dr_unsupportable)
1955     dr0 = unsupportable_dr;
1956   else if (do_peeling)
1957     {
1958       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1959          TODO: Use nopeel_outside_cost or get rid of it?  */
1960       unsigned nopeel_inside_cost = 0;
1961       unsigned nopeel_outside_cost = 0;
1962
1963       stmt_vector_for_cost dummy;
1964       dummy.create (2);
1965       vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1966                                       &nopeel_outside_cost, &dummy, 0, false);
1967       dummy.release ();
1968
1969       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
1970          costs will be recorded.  */
1971       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1972       prologue_cost_vec.create (2);
1973       epilogue_cost_vec.create (2);
1974
1975       int dummy2;
1976       nopeel_outside_cost += vect_get_known_peeling_cost
1977         (loop_vinfo, 0, &dummy2,
1978          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1979          &prologue_cost_vec, &epilogue_cost_vec);
1980
1981       prologue_cost_vec.release ();
1982       epilogue_cost_vec.release ();
1983
1984       npeel = best_peel.peel_info.npeel;
1985       dr0 = best_peel.peel_info.dr;
1986
1987       /* If no peeling is not more expensive than the best peeling we
1988          have so far, don't perform any peeling.  */
1989       if (nopeel_inside_cost <= best_peel.inside_cost)
1990         do_peeling = false;
1991     }
1992
1993   if (do_peeling)
1994     {
1995       stmt = DR_STMT (dr0);
1996       stmt_info = vinfo_for_stmt (stmt);
1997       vectype = STMT_VINFO_VECTYPE (stmt_info);
1998
1999       if (known_alignment_for_access_p (dr0))
2000         {
2001           bool negative = tree_int_cst_compare (DR_STEP (dr0),
2002                                                 size_zero_node) < 0;
2003           if (!npeel)
2004             {
2005               /* Since it's known at compile time, compute the number of
2006                  iterations in the peeled loop (the peeling factor) for use in
2007                  updating DR_MISALIGNMENT values.  The peeling factor is the
2008                  vectorization factor minus the misalignment as an element
2009                  count.  */
2010               mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2011               unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2012               npeel = ((mis & (target_align - 1))
2013                        / vect_get_scalar_dr_size (dr0));
2014             }
2015
2016           /* For interleaved data access every iteration accesses all the
2017              members of the group, therefore we divide the number of iterations
2018              by the group size.  */
2019           stmt_info = vinfo_for_stmt (DR_STMT (dr0));
2020           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2021             npeel /= GROUP_SIZE (stmt_info);
2022
2023           if (dump_enabled_p ())
2024             dump_printf_loc (MSG_NOTE, vect_location,
2025                              "Try peeling by %d\n", npeel);
2026         }
2027
2028       /* Ensure that all datarefs can be vectorized after the peel.  */
2029       if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2030         do_peeling = false;
2031
2032       /* Check if all datarefs are supportable and log.  */
2033       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2034         {
2035           stat = vect_verify_datarefs_alignment (loop_vinfo);
2036           if (!stat)
2037             do_peeling = false;
2038           else
2039             return stat;
2040         }
2041
2042       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2043       if (do_peeling)
2044         {
2045           unsigned max_allowed_peel
2046             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2047           if (max_allowed_peel != (unsigned)-1)
2048             {
2049               unsigned max_peel = npeel;
2050               if (max_peel == 0)
2051                 {
2052                   unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2053                   max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2054                 }
2055               if (max_peel > max_allowed_peel)
2056                 {
2057                   do_peeling = false;
2058                   if (dump_enabled_p ())
2059                     dump_printf_loc (MSG_NOTE, vect_location,
2060                         "Disable peeling, max peels reached: %d\n", max_peel);
2061                 }
2062             }
2063         }
2064
2065       /* Cost model #2 - if peeling may result in a remaining loop not
2066          iterating enough to be vectorized then do not peel.  Since this
2067          is a cost heuristic rather than a correctness decision, use the
2068          most likely runtime value for variable vectorization factors.  */
2069       if (do_peeling
2070           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2071         {
2072           unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2073           unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2074           if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2075               < assumed_vf + max_peel)
2076             do_peeling = false;
2077         }
2078
2079       if (do_peeling)
2080         {
2081           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2082              If the misalignment of DR_i is identical to that of dr0 then set
2083              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2084              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2085              by the peeling factor times the element size of DR_i (MOD the
2086              vectorization factor times the size).  Otherwise, the
2087              misalignment of DR_i must be set to unknown.  */
2088           FOR_EACH_VEC_ELT (datarefs, i, dr)
2089             if (dr != dr0)
2090               {
2091                 /* Strided accesses perform only component accesses, alignment
2092                    is irrelevant for them.  */
2093                 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2094                 if (STMT_VINFO_STRIDED_P (stmt_info)
2095                     && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2096                   continue;
2097
2098                 vect_update_misalignment_for_peel (dr, dr0, npeel);
2099               }
2100
2101           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2102           if (npeel)
2103             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2104           else
2105             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2106               = DR_MISALIGNMENT (dr0);
2107           SET_DR_MISALIGNMENT (dr0, 0);
2108           if (dump_enabled_p ())
2109             {
2110               dump_printf_loc (MSG_NOTE, vect_location,
2111                                "Alignment of access forced using peeling.\n");
2112               dump_printf_loc (MSG_NOTE, vect_location,
2113                                "Peeling for alignment will be applied.\n");
2114             }
2115
2116           /* The inside-loop cost will be accounted for in vectorizable_load
2117              and vectorizable_store correctly with adjusted alignments.
2118              Drop the body_cst_vec on the floor here.  */
2119           stat = vect_verify_datarefs_alignment (loop_vinfo);
2120           gcc_assert (stat);
2121           return stat;
2122         }
2123     }
2124
2125   /* (2) Versioning to force alignment.  */
2126
2127   /* Try versioning if:
2128      1) optimize loop for speed
2129      2) there is at least one unsupported misaligned data ref with an unknown
2130         misalignment, and
2131      3) all misaligned data refs with a known misalignment are supported, and
2132      4) the number of runtime alignment checks is within reason.  */
2133
2134   do_versioning =
2135         optimize_loop_nest_for_speed_p (loop)
2136         && (!loop->inner); /* FORNOW */
2137
2138   if (do_versioning)
2139     {
2140       FOR_EACH_VEC_ELT (datarefs, i, dr)
2141         {
2142           stmt = DR_STMT (dr);
2143           stmt_info = vinfo_for_stmt (stmt);
2144
2145           /* For interleaving, only the alignment of the first access
2146              matters.  */
2147           if (aligned_access_p (dr)
2148               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2149                   && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2150             continue;
2151
2152           if (STMT_VINFO_STRIDED_P (stmt_info))
2153             {
2154               /* Strided loads perform only component accesses, alignment is
2155                  irrelevant for them.  */
2156               if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2157                 continue;
2158               do_versioning = false;
2159               break;
2160             }
2161
2162           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2163
2164           if (!supportable_dr_alignment)
2165             {
2166               gimple *stmt;
2167               int mask;
2168               tree vectype;
2169
2170               if (known_alignment_for_access_p (dr)
2171                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2172                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2173                 {
2174                   do_versioning = false;
2175                   break;
2176                 }
2177
2178               stmt = DR_STMT (dr);
2179               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2180               gcc_assert (vectype);
2181
2182               /* At present we don't support versioning for alignment
2183                  with variable VF, since there's no guarantee that the
2184                  VF is a power of two.  We could relax this if we added
2185                  a way of enforcing a power-of-two size.  */
2186               unsigned HOST_WIDE_INT size;
2187               if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2188                 {
2189                   do_versioning = false;
2190                   break;
2191                 }
2192
2193               /* The rightmost bits of an aligned address must be zeros.
2194                  Construct the mask needed for this test.  For example,
2195                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2196                  mask must be 15 = 0xf. */
2197               mask = size - 1;
2198
2199               /* FORNOW: use the same mask to test all potentially unaligned
2200                  references in the loop.  The vectorizer currently supports
2201                  a single vector size, see the reference to
2202                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2203                  vectorization factor is computed.  */
2204               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2205                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2206               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2207               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2208                       DR_STMT (dr));
2209             }
2210         }
2211
2212       /* Versioning requires at least one misaligned data reference.  */
2213       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2214         do_versioning = false;
2215       else if (!do_versioning)
2216         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2217     }
2218
2219   if (do_versioning)
2220     {
2221       vec<gimple *> may_misalign_stmts
2222         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2223       gimple *stmt;
2224
2225       /* It can now be assumed that the data references in the statements
2226          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2227          of the loop being vectorized.  */
2228       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2229         {
2230           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2231           dr = STMT_VINFO_DATA_REF (stmt_info);
2232           SET_DR_MISALIGNMENT (dr, 0);
2233           if (dump_enabled_p ())
2234             dump_printf_loc (MSG_NOTE, vect_location,
2235                              "Alignment of access forced using versioning.\n");
2236         }
2237
2238       if (dump_enabled_p ())
2239         dump_printf_loc (MSG_NOTE, vect_location,
2240                          "Versioning for alignment will be applied.\n");
2241
2242       /* Peeling and versioning can't be done together at this time.  */
2243       gcc_assert (! (do_peeling && do_versioning));
2244
2245       stat = vect_verify_datarefs_alignment (loop_vinfo);
2246       gcc_assert (stat);
2247       return stat;
2248     }
2249
2250   /* This point is reached if neither peeling nor versioning is being done.  */
2251   gcc_assert (! (do_peeling || do_versioning));
2252
2253   stat = vect_verify_datarefs_alignment (loop_vinfo);
2254   return stat;
2255 }
2256
2257
2258 /* Function vect_find_same_alignment_drs.
2259
2260    Update group and alignment relations according to the chosen
2261    vectorization factor.  */
2262
2263 static void
2264 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2265 {
2266   struct data_reference *dra = DDR_A (ddr);
2267   struct data_reference *drb = DDR_B (ddr);
2268   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2269   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2270
2271   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2272     return;
2273
2274   if (dra == drb)
2275     return;
2276
2277   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2278       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2279       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2280     return;
2281
2282   /* Two references with distance zero have the same alignment.  */
2283   poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2284                           - wi::to_poly_offset (DR_INIT (drb)));
2285   if (maybe_ne (diff, 0))
2286     {
2287       /* Get the wider of the two alignments.  */
2288       unsigned int align_a = (vect_calculate_target_alignment (dra)
2289                               / BITS_PER_UNIT);
2290       unsigned int align_b = (vect_calculate_target_alignment (drb)
2291                               / BITS_PER_UNIT);
2292       unsigned int max_align = MAX (align_a, align_b);
2293
2294       /* Require the gap to be a multiple of the larger vector alignment.  */
2295       if (!multiple_p (diff, max_align))
2296         return;
2297     }
2298
2299   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2300   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2301   if (dump_enabled_p ())
2302     {
2303       dump_printf_loc (MSG_NOTE, vect_location,
2304                        "accesses have the same alignment: ");
2305       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2306       dump_printf (MSG_NOTE,  " and ");
2307       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2308       dump_printf (MSG_NOTE, "\n");
2309     }
2310 }
2311
2312
2313 /* Function vect_analyze_data_refs_alignment
2314
2315    Analyze the alignment of the data-references in the loop.
2316    Return FALSE if a data reference is found that cannot be vectorized.  */
2317
2318 bool
2319 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2320 {
2321   if (dump_enabled_p ())
2322     dump_printf_loc (MSG_NOTE, vect_location,
2323                      "=== vect_analyze_data_refs_alignment ===\n");
2324
2325   /* Mark groups of data references with same alignment using
2326      data dependence information.  */
2327   vec<ddr_p> ddrs = vinfo->ddrs;
2328   struct data_dependence_relation *ddr;
2329   unsigned int i;
2330
2331   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2332     vect_find_same_alignment_drs (ddr);
2333
2334   vec<data_reference_p> datarefs = vinfo->datarefs;
2335   struct data_reference *dr;
2336
2337   vect_record_base_alignments (vinfo);
2338   FOR_EACH_VEC_ELT (datarefs, i, dr)
2339     {
2340       stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2341       if (STMT_VINFO_VECTORIZABLE (stmt_info)
2342           && !vect_compute_data_ref_alignment (dr))
2343         {
2344           /* Strided accesses perform only component accesses, misalignment
2345              information is irrelevant for them.  */
2346           if (STMT_VINFO_STRIDED_P (stmt_info)
2347               && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2348             continue;
2349
2350           if (dump_enabled_p ())
2351             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2352                              "not vectorized: can't calculate alignment "
2353                              "for data ref.\n");
2354
2355           return false;
2356         }
2357     }
2358
2359   return true;
2360 }
2361
2362
2363 /* Analyze alignment of DRs of stmts in NODE.  */
2364
2365 static bool
2366 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2367 {
2368   /* We vectorize from the first scalar stmt in the node unless
2369      the node is permuted in which case we start from the first
2370      element in the group.  */
2371   gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2372   data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2373   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2374     first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2375
2376   data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2377   if (! vect_compute_data_ref_alignment (dr)
2378       /* For creating the data-ref pointer we need alignment of the
2379          first element anyway.  */
2380       || (dr != first_dr
2381           && ! vect_compute_data_ref_alignment (first_dr))
2382       || ! verify_data_ref_alignment (dr))
2383     {
2384       if (dump_enabled_p ())
2385         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2386                          "not vectorized: bad data alignment in basic "
2387                          "block.\n");
2388       return false;
2389     }
2390
2391   return true;
2392 }
2393
2394 /* Function vect_slp_analyze_instance_alignment
2395
2396    Analyze the alignment of the data-references in the SLP instance.
2397    Return FALSE if a data reference is found that cannot be vectorized.  */
2398
2399 bool
2400 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2401 {
2402   if (dump_enabled_p ())
2403     dump_printf_loc (MSG_NOTE, vect_location,
2404                      "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2405
2406   slp_tree node;
2407   unsigned i;
2408   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2409     if (! vect_slp_analyze_and_verify_node_alignment (node))
2410       return false;
2411
2412   node = SLP_INSTANCE_TREE (instance);
2413   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2414       && ! vect_slp_analyze_and_verify_node_alignment
2415              (SLP_INSTANCE_TREE (instance)))
2416     return false;
2417
2418   return true;
2419 }
2420
2421
2422 /* Analyze groups of accesses: check that DR belongs to a group of
2423    accesses of legal size, step, etc.  Detect gaps, single element
2424    interleaving, and other special cases. Set grouped access info.
2425    Collect groups of strided stores for further use in SLP analysis.
2426    Worker for vect_analyze_group_access.  */
2427
2428 static bool
2429 vect_analyze_group_access_1 (struct data_reference *dr)
2430 {
2431   tree step = DR_STEP (dr);
2432   tree scalar_type = TREE_TYPE (DR_REF (dr));
2433   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2434   gimple *stmt = DR_STMT (dr);
2435   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2436   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2437   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2438   HOST_WIDE_INT dr_step = -1;
2439   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2440   bool slp_impossible = false;
2441
2442   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2443      size of the interleaving group (including gaps).  */
2444   if (tree_fits_shwi_p (step))
2445     {
2446       dr_step = tree_to_shwi (step);
2447       /* Check that STEP is a multiple of type size.  Otherwise there is
2448          a non-element-sized gap at the end of the group which we
2449          cannot represent in GROUP_GAP or GROUP_SIZE.
2450          ???  As we can handle non-constant step fine here we should
2451          simply remove uses of GROUP_GAP between the last and first
2452          element and instead rely on DR_STEP.  GROUP_SIZE then would
2453          simply not include that gap.  */
2454       if ((dr_step % type_size) != 0)
2455         {
2456           if (dump_enabled_p ())
2457             {
2458               dump_printf_loc (MSG_NOTE, vect_location,
2459                                "Step ");
2460               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2461               dump_printf (MSG_NOTE,
2462                            " is not a multiple of the element size for ");
2463               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2464               dump_printf (MSG_NOTE, "\n");
2465             }
2466           return false;
2467         }
2468       groupsize = absu_hwi (dr_step) / type_size;
2469     }
2470   else
2471     groupsize = 0;
2472
2473   /* Not consecutive access is possible only if it is a part of interleaving.  */
2474   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2475     {
2476       /* Check if it this DR is a part of interleaving, and is a single
2477          element of the group that is accessed in the loop.  */
2478
2479       /* Gaps are supported only for loads. STEP must be a multiple of the type
2480          size.  */
2481       if (DR_IS_READ (dr)
2482           && (dr_step % type_size) == 0
2483           && groupsize > 0)
2484         {
2485           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2486           GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2487           GROUP_GAP (stmt_info) = groupsize - 1;
2488           if (dump_enabled_p ())
2489             {
2490               dump_printf_loc (MSG_NOTE, vect_location,
2491                                "Detected single element interleaving ");
2492               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2493               dump_printf (MSG_NOTE, " step ");
2494               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2495               dump_printf (MSG_NOTE, "\n");
2496             }
2497
2498           return true;
2499         }
2500
2501       if (dump_enabled_p ())
2502         {
2503           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2504                            "not consecutive access ");
2505           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2506         }
2507
2508       if (bb_vinfo)
2509         {
2510           /* Mark the statement as unvectorizable.  */
2511           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2512           return true;
2513         }
2514
2515       dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2516       STMT_VINFO_STRIDED_P (stmt_info) = true;
2517       return true;
2518     }
2519
2520   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2521     {
2522       /* First stmt in the interleaving chain. Check the chain.  */
2523       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2524       struct data_reference *data_ref = dr;
2525       unsigned int count = 1;
2526       tree prev_init = DR_INIT (data_ref);
2527       gimple *prev = stmt;
2528       HOST_WIDE_INT diff, gaps = 0;
2529
2530       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2531       while (next)
2532         {
2533           /* Skip same data-refs.  In case that two or more stmts share
2534              data-ref (supported only for loads), we vectorize only the first
2535              stmt, and the rest get their vectorized loads from the first
2536              one.  */
2537           if (!tree_int_cst_compare (DR_INIT (data_ref),
2538                                      DR_INIT (STMT_VINFO_DATA_REF (
2539                                                    vinfo_for_stmt (next)))))
2540             {
2541               if (DR_IS_WRITE (data_ref))
2542                 {
2543                   if (dump_enabled_p ())
2544                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2545                                      "Two store stmts share the same dr.\n");
2546                   return false;
2547                 }
2548
2549               if (dump_enabled_p ())
2550                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2551                                  "Two or more load stmts share the same dr.\n");
2552
2553               /* For load use the same data-ref load.  */
2554               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2555
2556               prev = next;
2557               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2558               continue;
2559             }
2560
2561           prev = next;
2562           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2563
2564           /* All group members have the same STEP by construction.  */
2565           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2566
2567           /* Check that the distance between two accesses is equal to the type
2568              size. Otherwise, we have gaps.  */
2569           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2570                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2571           if (diff != 1)
2572             {
2573               /* FORNOW: SLP of accesses with gaps is not supported.  */
2574               slp_impossible = true;
2575               if (DR_IS_WRITE (data_ref))
2576                 {
2577                   if (dump_enabled_p ())
2578                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2579                                      "interleaved store with gaps\n");
2580                   return false;
2581                 }
2582
2583               gaps += diff - 1;
2584             }
2585
2586           last_accessed_element += diff;
2587
2588           /* Store the gap from the previous member of the group. If there is no
2589              gap in the access, GROUP_GAP is always 1.  */
2590           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2591
2592           prev_init = DR_INIT (data_ref);
2593           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2594           /* Count the number of data-refs in the chain.  */
2595           count++;
2596         }
2597
2598       if (groupsize == 0)
2599         groupsize = count + gaps;
2600
2601       /* This could be UINT_MAX but as we are generating code in a very
2602          inefficient way we have to cap earlier.  See PR78699 for example.  */
2603       if (groupsize > 4096)
2604         {
2605           if (dump_enabled_p ())
2606             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2607                              "group is too large\n");
2608           return false;
2609         }
2610
2611       /* Check that the size of the interleaving is equal to count for stores,
2612          i.e., that there are no gaps.  */
2613       if (groupsize != count
2614           && !DR_IS_READ (dr))
2615         {
2616           if (dump_enabled_p ())
2617             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2618                              "interleaved store with gaps\n");
2619           return false;
2620         }
2621
2622       /* If there is a gap after the last load in the group it is the
2623          difference between the groupsize and the last accessed
2624          element.
2625          When there is no gap, this difference should be 0.  */
2626       GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2627
2628       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2629       if (dump_enabled_p ())
2630         {
2631           dump_printf_loc (MSG_NOTE, vect_location,
2632                            "Detected interleaving ");
2633           if (DR_IS_READ (dr))
2634             dump_printf (MSG_NOTE, "load ");
2635           else
2636             dump_printf (MSG_NOTE, "store ");
2637           dump_printf (MSG_NOTE, "of size %u starting with ",
2638                        (unsigned)groupsize);
2639           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2640           if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2641             dump_printf_loc (MSG_NOTE, vect_location,
2642                              "There is a gap of %u elements after the group\n",
2643                              GROUP_GAP (vinfo_for_stmt (stmt)));
2644         }
2645
2646       /* SLP: create an SLP data structure for every interleaving group of
2647          stores for further analysis in vect_analyse_slp.  */
2648       if (DR_IS_WRITE (dr) && !slp_impossible)
2649         {
2650           if (loop_vinfo)
2651             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2652           if (bb_vinfo)
2653             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2654         }
2655     }
2656
2657   return true;
2658 }
2659
2660 /* Analyze groups of accesses: check that DR belongs to a group of
2661    accesses of legal size, step, etc.  Detect gaps, single element
2662    interleaving, and other special cases. Set grouped access info.
2663    Collect groups of strided stores for further use in SLP analysis.  */
2664
2665 static bool
2666 vect_analyze_group_access (struct data_reference *dr)
2667 {
2668   if (!vect_analyze_group_access_1 (dr))
2669     {
2670       /* Dissolve the group if present.  */
2671       gimple *next;
2672       gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2673       while (stmt)
2674         {
2675           stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2676           next = GROUP_NEXT_ELEMENT (vinfo);
2677           GROUP_FIRST_ELEMENT (vinfo) = NULL;
2678           GROUP_NEXT_ELEMENT (vinfo) = NULL;
2679           stmt = next;
2680         }
2681       return false;
2682     }
2683   return true;
2684 }
2685
2686 /* Analyze the access pattern of the data-reference DR.
2687    In case of non-consecutive accesses call vect_analyze_group_access() to
2688    analyze groups of accesses.  */
2689
2690 static bool
2691 vect_analyze_data_ref_access (struct data_reference *dr)
2692 {
2693   tree step = DR_STEP (dr);
2694   tree scalar_type = TREE_TYPE (DR_REF (dr));
2695   gimple *stmt = DR_STMT (dr);
2696   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2697   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2698   struct loop *loop = NULL;
2699
2700   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2701     return true;
2702
2703   if (loop_vinfo)
2704     loop = LOOP_VINFO_LOOP (loop_vinfo);
2705
2706   if (loop_vinfo && !step)
2707     {
2708       if (dump_enabled_p ())
2709         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2710                          "bad data-ref access in loop\n");
2711       return false;
2712     }
2713
2714   /* Allow loads with zero step in inner-loop vectorization.  */
2715   if (loop_vinfo && integer_zerop (step))
2716     {
2717       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2718       if (!nested_in_vect_loop_p (loop, stmt))
2719         return DR_IS_READ (dr);
2720       /* Allow references with zero step for outer loops marked
2721          with pragma omp simd only - it guarantees absence of
2722          loop-carried dependencies between inner loop iterations.  */
2723       if (loop->safelen < 2)
2724         {
2725           if (dump_enabled_p ())
2726             dump_printf_loc (MSG_NOTE, vect_location,
2727                              "zero step in inner loop of nest\n");
2728           return false;
2729         }
2730     }
2731
2732   if (loop && nested_in_vect_loop_p (loop, stmt))
2733     {
2734       /* Interleaved accesses are not yet supported within outer-loop
2735         vectorization for references in the inner-loop.  */
2736       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2737
2738       /* For the rest of the analysis we use the outer-loop step.  */
2739       step = STMT_VINFO_DR_STEP (stmt_info);
2740       if (integer_zerop (step))
2741         {
2742           if (dump_enabled_p ())
2743             dump_printf_loc (MSG_NOTE, vect_location,
2744                              "zero step in outer loop.\n");
2745           return DR_IS_READ (dr);
2746         }
2747     }
2748
2749   /* Consecutive?  */
2750   if (TREE_CODE (step) == INTEGER_CST)
2751     {
2752       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2753       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2754           || (dr_step < 0
2755               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2756         {
2757           /* Mark that it is not interleaving.  */
2758           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2759           return true;
2760         }
2761     }
2762
2763   if (loop && nested_in_vect_loop_p (loop, stmt))
2764     {
2765       if (dump_enabled_p ())
2766         dump_printf_loc (MSG_NOTE, vect_location,
2767                          "grouped access in outer loop.\n");
2768       return false;
2769     }
2770
2771
2772   /* Assume this is a DR handled by non-constant strided load case.  */
2773   if (TREE_CODE (step) != INTEGER_CST)
2774     return (STMT_VINFO_STRIDED_P (stmt_info)
2775             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2776                 || vect_analyze_group_access (dr)));
2777
2778   /* Not consecutive access - check if it's a part of interleaving group.  */
2779   return vect_analyze_group_access (dr);
2780 }
2781
2782 /* Compare two data-references DRA and DRB to group them into chunks
2783    suitable for grouping.  */
2784
2785 static int
2786 dr_group_sort_cmp (const void *dra_, const void *drb_)
2787 {
2788   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2789   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2790   int cmp;
2791
2792   /* Stabilize sort.  */
2793   if (dra == drb)
2794     return 0;
2795
2796   /* DRs in different loops never belong to the same group.  */
2797   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2798   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2799   if (loopa != loopb)
2800     return loopa->num < loopb->num ? -1 : 1;
2801
2802   /* Ordering of DRs according to base.  */
2803   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2804                                DR_BASE_ADDRESS (drb));
2805   if (cmp != 0)
2806     return cmp;
2807
2808   /* And according to DR_OFFSET.  */
2809   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2810   if (cmp != 0)
2811     return cmp;
2812
2813   /* Put reads before writes.  */
2814   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2815     return DR_IS_READ (dra) ? -1 : 1;
2816
2817   /* Then sort after access size.  */
2818   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2819                                TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2820   if (cmp != 0)
2821     return cmp;
2822
2823   /* And after step.  */
2824   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2825   if (cmp != 0)
2826     return cmp;
2827
2828   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2829   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2830   if (cmp == 0)
2831     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2832   return cmp;
2833 }
2834
2835 /* If OP is the result of a conversion, return the unconverted value,
2836    otherwise return null.  */
2837
2838 static tree
2839 strip_conversion (tree op)
2840 {
2841   if (TREE_CODE (op) != SSA_NAME)
2842     return NULL_TREE;
2843   gimple *stmt = SSA_NAME_DEF_STMT (op);
2844   if (!is_gimple_assign (stmt)
2845       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2846     return NULL_TREE;
2847   return gimple_assign_rhs1 (stmt);
2848 }
2849
2850 /* Return true if vectorizable_* routines can handle statements STMT1
2851    and STMT2 being in a single group.  */
2852
2853 static bool
2854 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2855 {
2856   if (gimple_assign_single_p (stmt1))
2857     return gimple_assign_single_p (stmt2);
2858
2859   if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2860     {
2861       /* Check for two masked loads or two masked stores.  */
2862       if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2863         return false;
2864       internal_fn ifn = gimple_call_internal_fn (stmt1);
2865       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2866         return false;
2867       if (ifn != gimple_call_internal_fn (stmt2))
2868         return false;
2869
2870       /* Check that the masks are the same.  Cope with casts of masks,
2871          like those created by build_mask_conversion.  */
2872       tree mask1 = gimple_call_arg (stmt1, 2);
2873       tree mask2 = gimple_call_arg (stmt2, 2);
2874       if (!operand_equal_p (mask1, mask2, 0))
2875         {
2876           mask1 = strip_conversion (mask1);
2877           if (!mask1)
2878             return false;
2879           mask2 = strip_conversion (mask2);
2880           if (!mask2)
2881             return false;
2882           if (!operand_equal_p (mask1, mask2, 0))
2883             return false;
2884         }
2885       return true;
2886     }
2887
2888   return false;
2889 }
2890
2891 /* Function vect_analyze_data_ref_accesses.
2892
2893    Analyze the access pattern of all the data references in the loop.
2894
2895    FORNOW: the only access pattern that is considered vectorizable is a
2896            simple step 1 (consecutive) access.
2897
2898    FORNOW: handle only arrays and pointer accesses.  */
2899
2900 bool
2901 vect_analyze_data_ref_accesses (vec_info *vinfo)
2902 {
2903   unsigned int i;
2904   vec<data_reference_p> datarefs = vinfo->datarefs;
2905   struct data_reference *dr;
2906
2907   if (dump_enabled_p ())
2908     dump_printf_loc (MSG_NOTE, vect_location,
2909                      "=== vect_analyze_data_ref_accesses ===\n");
2910
2911   if (datarefs.is_empty ())
2912     return true;
2913
2914   /* Sort the array of datarefs to make building the interleaving chains
2915      linear.  Don't modify the original vector's order, it is needed for
2916      determining what dependencies are reversed.  */
2917   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2918   datarefs_copy.qsort (dr_group_sort_cmp);
2919
2920   /* Build the interleaving chains.  */
2921   for (i = 0; i < datarefs_copy.length () - 1;)
2922     {
2923       data_reference_p dra = datarefs_copy[i];
2924       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2925       stmt_vec_info lastinfo = NULL;
2926       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2927           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2928         {
2929           ++i;
2930           continue;
2931         }
2932       for (i = i + 1; i < datarefs_copy.length (); ++i)
2933         {
2934           data_reference_p drb = datarefs_copy[i];
2935           stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2936           if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2937               || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2938             break;
2939
2940           /* ???  Imperfect sorting (non-compatible types, non-modulo
2941              accesses, same accesses) can lead to a group to be artificially
2942              split here as we don't just skip over those.  If it really
2943              matters we can push those to a worklist and re-iterate
2944              over them.  The we can just skip ahead to the next DR here.  */
2945
2946           /* DRs in a different loop should not be put into the same
2947              interleaving group.  */
2948           if (gimple_bb (DR_STMT (dra))->loop_father
2949               != gimple_bb (DR_STMT (drb))->loop_father)
2950             break;
2951
2952           /* Check that the data-refs have same first location (except init)
2953              and they are both either store or load (not load and store,
2954              not masked loads or stores).  */
2955           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2956               || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2957                                         DR_BASE_ADDRESS (drb)) != 0
2958               || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2959               || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
2960             break;
2961
2962           /* Check that the data-refs have the same constant size.  */
2963           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2964           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2965           if (!tree_fits_uhwi_p (sza)
2966               || !tree_fits_uhwi_p (szb)
2967               || !tree_int_cst_equal (sza, szb))
2968             break;
2969
2970           /* Check that the data-refs have the same step.  */
2971           if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2972             break;
2973
2974           /* Check the types are compatible.
2975              ???  We don't distinguish this during sorting.  */
2976           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2977                                    TREE_TYPE (DR_REF (drb))))
2978             break;
2979
2980           /* Check that the DR_INITs are compile-time constants.  */
2981           if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2982               || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2983             break;
2984
2985           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2986           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2987           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2988           HOST_WIDE_INT init_prev
2989             = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2990           gcc_assert (init_a <= init_b
2991                       && init_a <= init_prev
2992                       && init_prev <= init_b);
2993
2994           /* Do not place the same access in the interleaving chain twice.  */
2995           if (init_b == init_prev)
2996             {
2997               gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2998                           < gimple_uid (DR_STMT (drb)));
2999               /* ???  For now we simply "drop" the later reference which is
3000                  otherwise the same rather than finishing off this group.
3001                  In the end we'd want to re-process duplicates forming
3002                  multiple groups from the refs, likely by just collecting
3003                  all candidates (including duplicates and split points
3004                  below) in a vector and then process them together.  */
3005               continue;
3006             }
3007
3008           /* If init_b == init_a + the size of the type * k, we have an
3009              interleaving, and DRA is accessed before DRB.  */
3010           HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3011           if (type_size_a == 0
3012               || (init_b - init_a) % type_size_a != 0)
3013             break;
3014
3015           /* If we have a store, the accesses are adjacent.  This splits
3016              groups into chunks we support (we don't support vectorization
3017              of stores with gaps).  */
3018           if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3019             break;
3020
3021           /* If the step (if not zero or non-constant) is greater than the
3022              difference between data-refs' inits this splits groups into
3023              suitable sizes.  */
3024           if (tree_fits_shwi_p (DR_STEP (dra)))
3025             {
3026               HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3027               if (step != 0 && step <= (init_b - init_a))
3028                 break;
3029             }
3030
3031           if (dump_enabled_p ())
3032             {
3033               dump_printf_loc (MSG_NOTE, vect_location,
3034                                "Detected interleaving ");
3035               if (DR_IS_READ (dra))
3036                 dump_printf (MSG_NOTE, "load ");
3037               else
3038                 dump_printf (MSG_NOTE, "store ");
3039               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3040               dump_printf (MSG_NOTE,  " and ");
3041               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3042               dump_printf (MSG_NOTE, "\n");
3043             }
3044
3045           /* Link the found element into the group list.  */
3046           if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
3047             {
3048               GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
3049               lastinfo = stmtinfo_a;
3050             }
3051           GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
3052           GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
3053           lastinfo = stmtinfo_b;
3054         }
3055     }
3056
3057   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3058     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
3059         && !vect_analyze_data_ref_access (dr))
3060       {
3061         if (dump_enabled_p ())
3062           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3063                            "not vectorized: complicated access pattern.\n");
3064
3065         if (is_a <bb_vec_info> (vinfo))
3066           {
3067             /* Mark the statement as not vectorizable.  */
3068             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3069             continue;
3070           }
3071         else
3072           {
3073             datarefs_copy.release ();
3074             return false;
3075           }
3076       }
3077
3078   datarefs_copy.release ();
3079   return true;
3080 }
3081
3082 /* Function vect_vfa_segment_size.
3083
3084    Input:
3085      DR: The data reference.
3086      LENGTH_FACTOR: segment length to consider.
3087
3088    Return a value suitable for the dr_with_seg_len::seg_len field.
3089    This is the "distance travelled" by the pointer from the first
3090    iteration in the segment to the last.  Note that it does not include
3091    the size of the access; in effect it only describes the first byte.  */
3092
3093 static tree
3094 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3095 {
3096   length_factor = size_binop (MINUS_EXPR,
3097                               fold_convert (sizetype, length_factor),
3098                               size_one_node);
3099   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3100                      length_factor);
3101 }
3102
3103 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3104    gives the worst-case number of bytes covered by the segment.  */
3105
3106 static unsigned HOST_WIDE_INT
3107 vect_vfa_access_size (data_reference *dr)
3108 {
3109   stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr));
3110   tree ref_type = TREE_TYPE (DR_REF (dr));
3111   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3112   unsigned HOST_WIDE_INT access_size = ref_size;
3113   if (GROUP_FIRST_ELEMENT (stmt_vinfo))
3114     {
3115       gcc_assert (GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr));
3116       access_size *= GROUP_SIZE (stmt_vinfo) - GROUP_GAP (stmt_vinfo);
3117     }
3118   if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3119       && (vect_supportable_dr_alignment (dr, false)
3120           == dr_explicit_realign_optimized))
3121     {
3122       /* We might access a full vector's worth.  */
3123       tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3124       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3125     }
3126   return access_size;
3127 }
3128
3129 /* Get the minimum alignment for all the scalar accesses that DR describes.  */
3130
3131 static unsigned int
3132 vect_vfa_align (const data_reference *dr)
3133 {
3134   return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3135 }
3136
3137 /* Function vect_no_alias_p.
3138
3139    Given data references A and B with equal base and offset, see whether
3140    the alias relation can be decided at compilation time.  Return 1 if
3141    it can and the references alias, 0 if it can and the references do
3142    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3143    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3144    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
3145
3146 static int
3147 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3148                          tree segment_length_a, tree segment_length_b,
3149                          unsigned HOST_WIDE_INT access_size_a,
3150                          unsigned HOST_WIDE_INT access_size_b)
3151 {
3152   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3153   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3154   poly_uint64 const_length_a;
3155   poly_uint64 const_length_b;
3156
3157   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3158      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3159      [a, a+12) */
3160   if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3161     {
3162       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3163       offset_a = (offset_a + access_size_a) - const_length_a;
3164     }
3165   else
3166     const_length_a = tree_to_poly_uint64 (segment_length_a);
3167   if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3168     {
3169       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3170       offset_b = (offset_b + access_size_b) - const_length_b;
3171     }
3172   else
3173     const_length_b = tree_to_poly_uint64 (segment_length_b);
3174
3175   const_length_a += access_size_a;
3176   const_length_b += access_size_b;
3177
3178   if (ranges_known_overlap_p (offset_a, const_length_a,
3179                               offset_b, const_length_b))
3180     return 1;
3181
3182   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3183                                offset_b, const_length_b))
3184     return 0;
3185
3186   return -1;
3187 }
3188
3189 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3190    in DDR is >= VF.  */
3191
3192 static bool
3193 dependence_distance_ge_vf (data_dependence_relation *ddr,
3194                            unsigned int loop_depth, poly_uint64 vf)
3195 {
3196   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3197       || DDR_NUM_DIST_VECTS (ddr) == 0)
3198     return false;
3199
3200   /* If the dependence is exact, we should have limited the VF instead.  */
3201   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3202
3203   unsigned int i;
3204   lambda_vector dist_v;
3205   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3206     {
3207       HOST_WIDE_INT dist = dist_v[loop_depth];
3208       if (dist != 0
3209           && !(dist > 0 && DDR_REVERSED_P (ddr))
3210           && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3211         return false;
3212     }
3213
3214   if (dump_enabled_p ())
3215     {
3216       dump_printf_loc (MSG_NOTE, vect_location,
3217                        "dependence distance between ");
3218       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3219       dump_printf (MSG_NOTE,  " and ");
3220       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3221       dump_printf (MSG_NOTE,  " is >= VF\n");
3222     }
3223
3224   return true;
3225 }
3226
3227 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3228
3229 static void
3230 dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound)
3231 {
3232   dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3233   dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3234   dump_printf (dump_kind, ") >= ");
3235   dump_dec (dump_kind, lower_bound.min_value);
3236 }
3237
3238 /* Record that the vectorized loop requires the vec_lower_bound described
3239    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3240
3241 static void
3242 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3243                         poly_uint64 min_value)
3244 {
3245   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3246   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3247     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3248       {
3249         unsigned_p &= lower_bounds[i].unsigned_p;
3250         min_value = upper_bound (lower_bounds[i].min_value, min_value);
3251         if (lower_bounds[i].unsigned_p != unsigned_p
3252             || maybe_lt (lower_bounds[i].min_value, min_value))
3253           {
3254             lower_bounds[i].unsigned_p = unsigned_p;
3255             lower_bounds[i].min_value = min_value;
3256             if (dump_enabled_p ())
3257               {
3258                 dump_printf_loc (MSG_NOTE, vect_location,
3259                                  "updating run-time check to ");
3260                 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3261                 dump_printf (MSG_NOTE, "\n");
3262               }
3263           }
3264         return;
3265       }
3266
3267   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3268   if (dump_enabled_p ())
3269     {
3270       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3271       dump_lower_bound (MSG_NOTE, lower_bound);
3272       dump_printf (MSG_NOTE, "\n");
3273     }
3274   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3275 }
3276
3277 /* Return true if it's unlikely that the step of the vectorized form of DR
3278    will span fewer than GAP bytes.  */
3279
3280 static bool
3281 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3282 {
3283   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
3284   HOST_WIDE_INT count
3285     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3286   if (GROUP_FIRST_ELEMENT (stmt_info))
3287     count *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
3288   return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3289 }
3290
3291 /* Return true if we know that there is no alias between DR_A and DR_B
3292    when abs (DR_STEP (DR_A)) >= N for some N.  When returning true, set
3293    *LOWER_BOUND_OUT to this N.  */
3294
3295 static bool
3296 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3297                                 poly_uint64 *lower_bound_out)
3298 {
3299   /* Check that there is a constant gap of known sign between DR_A
3300      and DR_B.  */
3301   poly_int64 init_a, init_b;
3302   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3303       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3304       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3305       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3306       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3307       || !ordered_p (init_a, init_b))
3308     return false;
3309
3310   /* Sort DR_A and DR_B by the address they access.  */
3311   if (maybe_lt (init_b, init_a))
3312     {
3313       std::swap (init_a, init_b);
3314       std::swap (dr_a, dr_b);
3315     }
3316
3317   /* If the two accesses could be dependent within a scalar iteration,
3318      make sure that we'd retain their order.  */
3319   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3320       && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b)))
3321     return false;
3322
3323   /* There is no alias if abs (DR_STEP) is greater than or equal to
3324      the bytes spanned by the combination of the two accesses.  */
3325   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3326   return true;
3327 }
3328
3329 /* Function vect_prune_runtime_alias_test_list.
3330
3331    Prune a list of ddrs to be tested at run-time by versioning for alias.
3332    Merge several alias checks into one if possible.
3333    Return FALSE if resulting list of ddrs is longer then allowed by
3334    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3335
3336 bool
3337 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3338 {
3339   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3340   hash_set <tree_pair_hash> compared_objects;
3341
3342   vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3343   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3344     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3345   vec<vec_object_pair> &check_unequal_addrs
3346     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3347   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3348   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3349
3350   ddr_p ddr;
3351   unsigned int i;
3352   tree length_factor;
3353
3354   if (dump_enabled_p ())
3355     dump_printf_loc (MSG_NOTE, vect_location,
3356                      "=== vect_prune_runtime_alias_test_list ===\n");
3357
3358   /* Step values are irrelevant for aliasing if the number of vector
3359      iterations is equal to the number of scalar iterations (which can
3360      happen for fully-SLP loops).  */
3361   bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3362
3363   if (!ignore_step_p)
3364     {
3365       /* Convert the checks for nonzero steps into bound tests.  */
3366       tree value;
3367       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3368         vect_check_lower_bound (loop_vinfo, value, true, 1);
3369     }
3370
3371   if (may_alias_ddrs.is_empty ())
3372     return true;
3373
3374   comp_alias_ddrs.create (may_alias_ddrs.length ());
3375
3376   unsigned int loop_depth
3377     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3378                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
3379
3380   /* First, we collect all data ref pairs for aliasing checks.  */
3381   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3382     {
3383       int comp_res;
3384       poly_uint64 lower_bound;
3385       struct data_reference *dr_a, *dr_b;
3386       gimple *dr_group_first_a, *dr_group_first_b;
3387       tree segment_length_a, segment_length_b;
3388       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3389       unsigned int align_a, align_b;
3390       gimple *stmt_a, *stmt_b;
3391
3392       /* Ignore the alias if the VF we chose ended up being no greater
3393          than the dependence distance.  */
3394       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3395         continue;
3396
3397       if (DDR_OBJECT_A (ddr))
3398         {
3399           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3400           if (!compared_objects.add (new_pair))
3401             {
3402               if (dump_enabled_p ())
3403                 {
3404                   dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3405                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3406                   dump_printf (MSG_NOTE, " and ");
3407                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3408                   dump_printf (MSG_NOTE, " have different addresses\n");
3409                 }
3410               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3411             }
3412           continue;
3413         }
3414
3415       dr_a = DDR_A (ddr);
3416       stmt_a = DR_STMT (DDR_A (ddr));
3417
3418       dr_b = DDR_B (ddr);
3419       stmt_b = DR_STMT (DDR_B (ddr));
3420
3421       /* Skip the pair if inter-iteration dependencies are irrelevant
3422          and intra-iteration dependencies are guaranteed to be honored.  */
3423       if (ignore_step_p
3424           && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3425               || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3426         {
3427           if (dump_enabled_p ())
3428             {
3429               dump_printf_loc (MSG_NOTE, vect_location,
3430                                "no need for alias check between ");
3431               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3432               dump_printf (MSG_NOTE, " and ");
3433               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3434               dump_printf (MSG_NOTE, " when VF is 1\n");
3435             }
3436           continue;
3437         }
3438
3439       /* See whether we can handle the alias using a bounds check on
3440          the step, and whether that's likely to be the best approach.
3441          (It might not be, for example, if the minimum step is much larger
3442          than the number of bytes handled by one vector iteration.)  */
3443       if (!ignore_step_p
3444           && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3445           && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3446           && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3447               || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3448         {
3449           bool unsigned_p = dr_known_forward_stride_p (dr_a);
3450           if (dump_enabled_p ())
3451             {
3452               dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3453               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3454               dump_printf (MSG_NOTE, " and ");
3455               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3456               dump_printf (MSG_NOTE, " when the step ");
3457               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3458               dump_printf (MSG_NOTE, " is outside ");
3459               if (unsigned_p)
3460                 dump_printf (MSG_NOTE, "[0");
3461               else
3462                 {
3463                   dump_printf (MSG_NOTE, "(");
3464                   dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3465                 }
3466               dump_printf (MSG_NOTE, ", ");
3467               dump_dec (MSG_NOTE, lower_bound);
3468               dump_printf (MSG_NOTE, ")\n");
3469             }
3470           vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3471                                   lower_bound);
3472           continue;
3473         }
3474
3475       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3476       if (dr_group_first_a)
3477         {
3478           stmt_a = dr_group_first_a;
3479           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3480         }
3481
3482       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3483       if (dr_group_first_b)
3484         {
3485           stmt_b = dr_group_first_b;
3486           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3487         }
3488
3489       if (ignore_step_p)
3490         {
3491           segment_length_a = size_zero_node;
3492           segment_length_b = size_zero_node;
3493         }
3494       else
3495         {
3496           if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3497             length_factor = scalar_loop_iters;
3498           else
3499             length_factor = size_int (vect_factor);
3500           segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3501           segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3502         }
3503       access_size_a = vect_vfa_access_size (dr_a);
3504       access_size_b = vect_vfa_access_size (dr_b);
3505       align_a = vect_vfa_align (dr_a);
3506       align_b = vect_vfa_align (dr_b);
3507
3508       comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3509                                         DR_BASE_ADDRESS (dr_b));
3510       if (comp_res == 0)
3511         comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3512                                           DR_OFFSET (dr_b));
3513
3514       /* See whether the alias is known at compilation time.  */
3515       if (comp_res == 0
3516           && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3517           && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3518           && poly_int_tree_p (segment_length_a)
3519           && poly_int_tree_p (segment_length_b))
3520         {
3521           int res = vect_compile_time_alias (dr_a, dr_b,
3522                                              segment_length_a,
3523                                              segment_length_b,
3524                                              access_size_a,
3525                                              access_size_b);
3526           if (res >= 0 && dump_enabled_p ())
3527             {
3528               dump_printf_loc (MSG_NOTE, vect_location,
3529                                "can tell at compile time that ");
3530               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3531               dump_printf (MSG_NOTE, " and ");
3532               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3533               if (res == 0)
3534                 dump_printf (MSG_NOTE, " do not alias\n");
3535               else
3536                 dump_printf (MSG_NOTE, " alias\n");
3537             }
3538
3539           if (res == 0)
3540             continue;
3541
3542           if (res == 1)
3543             {
3544               if (dump_enabled_p ())
3545                 dump_printf_loc (MSG_NOTE, vect_location,
3546                                  "not vectorized: compilation time alias.\n");
3547               return false;
3548             }
3549         }
3550
3551       dr_with_seg_len_pair_t dr_with_seg_len_pair
3552         (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3553          dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3554
3555       /* Canonicalize pairs by sorting the two DR members.  */
3556       if (comp_res > 0)
3557         std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3558
3559       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3560     }
3561
3562   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3563
3564   unsigned int count = (comp_alias_ddrs.length ()
3565                         + check_unequal_addrs.length ());
3566
3567   dump_printf_loc (MSG_NOTE, vect_location,
3568                    "improved number of alias checks from %d to %d\n",
3569                    may_alias_ddrs.length (), count);
3570   if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3571     {
3572       if (dump_enabled_p ())
3573         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3574                          "number of versioning for alias "
3575                          "run-time tests exceeds %d "
3576                          "(--param vect-max-version-for-alias-checks)\n",
3577                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3578       return false;
3579     }
3580
3581   return true;
3582 }
3583
3584 /* Check whether we can use an internal function for a gather load
3585    or scatter store.  READ_P is true for loads and false for stores.
3586    MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
3587    the type of the memory elements being loaded or stored.  OFFSET_BITS
3588    is the number of bits in each scalar offset and OFFSET_SIGN is the
3589    sign of the offset.  SCALE is the amount by which the offset should
3590    be multiplied *after* it has been converted to address width.
3591
3592    Return true if the function is supported, storing the function
3593    id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT.  */
3594
3595 bool
3596 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3597                           tree memory_type, unsigned int offset_bits,
3598                           signop offset_sign, int scale,
3599                           internal_fn *ifn_out, tree *element_type_out)
3600 {
3601   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3602   unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3603   if (offset_bits > element_bits)
3604     /* Internal functions require the offset to be the same width as
3605        the vector elements.  We can extend narrower offsets, but it isn't
3606        safe to truncate wider offsets.  */
3607     return false;
3608
3609   if (element_bits != memory_bits)
3610     /* For now the vector elements must be the same width as the
3611        memory elements.  */
3612     return false;
3613
3614   /* Work out which function we need.  */
3615   internal_fn ifn;
3616   if (read_p)
3617     ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3618   else
3619     ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3620
3621   /* Test whether the target supports this combination.  */
3622   if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3623                                                offset_sign, scale))
3624     return false;
3625
3626   *ifn_out = ifn;
3627   *element_type_out = TREE_TYPE (vectype);
3628   return true;
3629 }
3630
3631 /* CALL is a call to an internal gather load or scatter store function.
3632    Describe the operation in INFO.  */
3633
3634 static void
3635 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3636 {
3637   stmt_vec_info stmt_info = vinfo_for_stmt (call);
3638   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3639   data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3640
3641   info->ifn = gimple_call_internal_fn (call);
3642   info->decl = NULL_TREE;
3643   info->base = gimple_call_arg (call, 0);
3644   info->offset = gimple_call_arg (call, 1);
3645   info->offset_dt = vect_unknown_def_type;
3646   info->offset_vectype = NULL_TREE;
3647   info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3648   info->element_type = TREE_TYPE (vectype);
3649   info->memory_type = TREE_TYPE (DR_REF (dr));
3650 }
3651
3652 /* Return true if a non-affine read or write in STMT is suitable for a
3653    gather load or scatter store.  Describe the operation in *INFO if so.  */
3654
3655 bool
3656 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3657                            gather_scatter_info *info)
3658 {
3659   HOST_WIDE_INT scale = 1;
3660   poly_int64 pbitpos, pbitsize;
3661   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3662   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3663   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3664   tree offtype = NULL_TREE;
3665   tree decl = NULL_TREE, base, off;
3666   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3667   tree memory_type = TREE_TYPE (DR_REF (dr));
3668   machine_mode pmode;
3669   int punsignedp, reversep, pvolatilep = 0;
3670   internal_fn ifn;
3671   tree element_type;
3672   bool masked_p = false;
3673
3674   /* See whether this is already a call to a gather/scatter internal function.
3675      If not, see whether it's a masked load or store.  */
3676   gcall *call = dyn_cast <gcall *> (stmt);
3677   if (call && gimple_call_internal_p (call))
3678     {
3679       ifn = gimple_call_internal_fn (stmt);
3680       if (internal_gather_scatter_fn_p (ifn))
3681         {
3682           vect_describe_gather_scatter_call (call, info);
3683           return true;
3684         }
3685       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3686     }
3687
3688   /* True if we should aim to use internal functions rather than
3689      built-in functions.  */
3690   bool use_ifn_p = (DR_IS_READ (dr)
3691                     ? supports_vec_gather_load_p ()
3692                     : supports_vec_scatter_store_p ());
3693
3694   base = DR_REF (dr);
3695   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3696      see if we can use the def stmt of the address.  */
3697   if (masked_p
3698       && TREE_CODE (base) == MEM_REF
3699       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3700       && integer_zerop (TREE_OPERAND (base, 1))
3701       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3702     {
3703       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3704       if (is_gimple_assign (def_stmt)
3705           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3706         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3707     }
3708
3709   /* The gather and scatter builtins need address of the form
3710      loop_invariant + vector * {1, 2, 4, 8}
3711      or
3712      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3713      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3714      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3715      multiplications and additions in it.  To get a vector, we need
3716      a single SSA_NAME that will be defined in the loop and will
3717      contain everything that is not loop invariant and that can be
3718      vectorized.  The following code attempts to find such a preexistng
3719      SSA_NAME OFF and put the loop invariants into a tree BASE
3720      that can be gimplified before the loop.  */
3721   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3722                               &punsignedp, &reversep, &pvolatilep);
3723   gcc_assert (base && !reversep);
3724   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3725
3726   if (TREE_CODE (base) == MEM_REF)
3727     {
3728       if (!integer_zerop (TREE_OPERAND (base, 1)))
3729         {
3730           if (off == NULL_TREE)
3731             off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3732           else
3733             off = size_binop (PLUS_EXPR, off,
3734                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3735         }
3736       base = TREE_OPERAND (base, 0);
3737     }
3738   else
3739     base = build_fold_addr_expr (base);
3740
3741   if (off == NULL_TREE)
3742     off = size_zero_node;
3743
3744   /* If base is not loop invariant, either off is 0, then we start with just
3745      the constant offset in the loop invariant BASE and continue with base
3746      as OFF, otherwise give up.
3747      We could handle that case by gimplifying the addition of base + off
3748      into some SSA_NAME and use that as off, but for now punt.  */
3749   if (!expr_invariant_in_loop_p (loop, base))
3750     {
3751       if (!integer_zerop (off))
3752         return false;
3753       off = base;
3754       base = size_int (pbytepos);
3755     }
3756   /* Otherwise put base + constant offset into the loop invariant BASE
3757      and continue with OFF.  */
3758   else
3759     {
3760       base = fold_convert (sizetype, base);
3761       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3762     }
3763
3764   /* OFF at this point may be either a SSA_NAME or some tree expression
3765      from get_inner_reference.  Try to peel off loop invariants from it
3766      into BASE as long as possible.  */
3767   STRIP_NOPS (off);
3768   while (offtype == NULL_TREE)
3769     {
3770       enum tree_code code;
3771       tree op0, op1, add = NULL_TREE;
3772
3773       if (TREE_CODE (off) == SSA_NAME)
3774         {
3775           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3776
3777           if (expr_invariant_in_loop_p (loop, off))
3778             return false;
3779
3780           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3781             break;
3782
3783           op0 = gimple_assign_rhs1 (def_stmt);
3784           code = gimple_assign_rhs_code (def_stmt);
3785           op1 = gimple_assign_rhs2 (def_stmt);
3786         }
3787       else
3788         {
3789           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3790             return false;
3791           code = TREE_CODE (off);
3792           extract_ops_from_tree (off, &code, &op0, &op1);
3793         }
3794       switch (code)
3795         {
3796         case POINTER_PLUS_EXPR:
3797         case PLUS_EXPR:
3798           if (expr_invariant_in_loop_p (loop, op0))
3799             {
3800               add = op0;
3801               off = op1;
3802             do_add:
3803               add = fold_convert (sizetype, add);
3804               if (scale != 1)
3805                 add = size_binop (MULT_EXPR, add, size_int (scale));
3806               base = size_binop (PLUS_EXPR, base, add);
3807               continue;
3808             }
3809           if (expr_invariant_in_loop_p (loop, op1))
3810             {
3811               add = op1;
3812               off = op0;
3813               goto do_add;
3814             }
3815           break;
3816         case MINUS_EXPR:
3817           if (expr_invariant_in_loop_p (loop, op1))
3818             {
3819               add = fold_convert (sizetype, op1);
3820               add = size_binop (MINUS_EXPR, size_zero_node, add);
3821               off = op0;
3822               goto do_add;
3823             }
3824           break;
3825         case MULT_EXPR:
3826           if (scale == 1 && tree_fits_shwi_p (op1))
3827             {
3828               int new_scale = tree_to_shwi (op1);
3829               /* Only treat this as a scaling operation if the target
3830                  supports it.  */
3831               if (use_ifn_p
3832                   && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3833                                                 vectype, memory_type, 1,
3834                                                 TYPE_SIGN (TREE_TYPE (op0)),
3835                                                 new_scale, &ifn,
3836                                                 &element_type))
3837                 break;
3838               scale = new_scale;
3839               off = op0;
3840               continue;
3841             }
3842           break;
3843         case SSA_NAME:
3844           off = op0;
3845           continue;
3846         CASE_CONVERT:
3847           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3848               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3849             break;
3850           if (TYPE_PRECISION (TREE_TYPE (op0))
3851               == TYPE_PRECISION (TREE_TYPE (off)))
3852             {
3853               off = op0;
3854               continue;
3855             }
3856
3857           /* The internal functions need the offset to be the same width
3858              as the elements of VECTYPE.  Don't include operations that
3859              cast the offset from that width to a different width.  */
3860           if (use_ifn_p
3861               && (int_size_in_bytes (TREE_TYPE (vectype))
3862                   == int_size_in_bytes (TREE_TYPE (off))))
3863             break;
3864
3865           if (TYPE_PRECISION (TREE_TYPE (op0))
3866               < TYPE_PRECISION (TREE_TYPE (off)))
3867             {
3868               off = op0;
3869               offtype = TREE_TYPE (off);
3870               STRIP_NOPS (off);
3871               continue;
3872             }
3873           break;
3874         default:
3875           break;
3876         }
3877       break;
3878     }
3879
3880   /* If at the end OFF still isn't a SSA_NAME or isn't
3881      defined in the loop, punt.  */
3882   if (TREE_CODE (off) != SSA_NAME
3883       || expr_invariant_in_loop_p (loop, off))
3884     return false;
3885
3886   if (offtype == NULL_TREE)
3887     offtype = TREE_TYPE (off);
3888
3889   if (use_ifn_p)
3890     {
3891       if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3892                                      memory_type, TYPE_PRECISION (offtype),
3893                                      TYPE_SIGN (offtype), scale, &ifn,
3894                                      &element_type))
3895         return false;
3896     }
3897   else
3898     {
3899       if (DR_IS_READ (dr))
3900         {
3901           if (targetm.vectorize.builtin_gather)
3902             decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3903         }
3904       else
3905         {
3906           if (targetm.vectorize.builtin_scatter)
3907             decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3908         }
3909
3910       if (!decl)
3911         return false;
3912
3913       ifn = IFN_LAST;
3914       element_type = TREE_TYPE (vectype);
3915     }
3916
3917   info->ifn = ifn;
3918   info->decl = decl;
3919   info->base = base;
3920   info->offset = off;
3921   info->offset_dt = vect_unknown_def_type;
3922   info->offset_vectype = NULL_TREE;
3923   info->scale = scale;
3924   info->element_type = element_type;
3925   info->memory_type = memory_type;
3926   return true;
3927 }
3928
3929 /* Function vect_analyze_data_refs.
3930
3931   Find all the data references in the loop or basic block.
3932
3933    The general structure of the analysis of data refs in the vectorizer is as
3934    follows:
3935    1- vect_analyze_data_refs(loop/bb): call
3936       compute_data_dependences_for_loop/bb to find and analyze all data-refs
3937       in the loop/bb and their dependences.
3938    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3939    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3940    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3941
3942 */
3943
3944 bool
3945 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3946 {
3947   struct loop *loop = NULL;
3948   unsigned int i;
3949   struct data_reference *dr;
3950   tree scalar_type;
3951
3952   if (dump_enabled_p ())
3953     dump_printf_loc (MSG_NOTE, vect_location,
3954                      "=== vect_analyze_data_refs ===\n");
3955
3956   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3957     loop = LOOP_VINFO_LOOP (loop_vinfo);
3958
3959   /* Go through the data-refs, check that the analysis succeeded.  Update
3960      pointer from stmt_vec_info struct to DR and vectype.  */
3961
3962   vec<data_reference_p> datarefs = vinfo->datarefs;
3963   FOR_EACH_VEC_ELT (datarefs, i, dr)
3964     {
3965       gimple *stmt;
3966       stmt_vec_info stmt_info;
3967       tree base, offset, init;
3968       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3969       bool simd_lane_access = false;
3970       poly_uint64 vf;
3971
3972 again:
3973       if (!dr || !DR_REF (dr))
3974         {
3975           if (dump_enabled_p ())
3976             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3977                              "not vectorized: unhandled data-ref\n");
3978           return false;
3979         }
3980
3981       stmt = DR_STMT (dr);
3982       stmt_info = vinfo_for_stmt (stmt);
3983
3984       /* Discard clobbers from the dataref vector.  We will remove
3985          clobber stmts during vectorization.  */
3986       if (gimple_clobber_p (stmt))
3987         {
3988           free_data_ref (dr);
3989           if (i == datarefs.length () - 1)
3990             {
3991               datarefs.pop ();
3992               break;
3993             }
3994           datarefs.ordered_remove (i);
3995           dr = datarefs[i];
3996           goto again;
3997         }
3998
3999       /* Check that analysis of the data-ref succeeded.  */
4000       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4001           || !DR_STEP (dr))
4002         {
4003           bool maybe_gather
4004             = DR_IS_READ (dr)
4005               && !TREE_THIS_VOLATILE (DR_REF (dr))
4006               && (targetm.vectorize.builtin_gather != NULL
4007                   || supports_vec_gather_load_p ());
4008           bool maybe_scatter
4009             = DR_IS_WRITE (dr)
4010               && !TREE_THIS_VOLATILE (DR_REF (dr))
4011               && (targetm.vectorize.builtin_scatter != NULL
4012                   || supports_vec_scatter_store_p ());
4013           bool maybe_simd_lane_access
4014             = is_a <loop_vec_info> (vinfo) && loop->simduid;
4015
4016           /* If target supports vector gather loads or scatter stores, or if
4017              this might be a SIMD lane access, see if they can't be used.  */
4018           if (is_a <loop_vec_info> (vinfo)
4019               && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
4020               && !nested_in_vect_loop_p (loop, stmt))
4021             {
4022               struct data_reference *newdr
4023                 = create_data_ref (NULL, loop_containing_stmt (stmt),
4024                                    DR_REF (dr), stmt, !maybe_scatter,
4025                                    DR_IS_CONDITIONAL_IN_STMT (dr));
4026               gcc_assert (newdr != NULL && DR_REF (newdr));
4027               if (DR_BASE_ADDRESS (newdr)
4028                   && DR_OFFSET (newdr)
4029                   && DR_INIT (newdr)
4030                   && DR_STEP (newdr)
4031                   && integer_zerop (DR_STEP (newdr)))
4032                 {
4033                   if (maybe_simd_lane_access)
4034                     {
4035                       tree off = DR_OFFSET (newdr);
4036                       STRIP_NOPS (off);
4037                       if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4038                           && TREE_CODE (off) == MULT_EXPR
4039                           && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4040                         {
4041                           tree step = TREE_OPERAND (off, 1);
4042                           off = TREE_OPERAND (off, 0);
4043                           STRIP_NOPS (off);
4044                           if (CONVERT_EXPR_P (off)
4045                               && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4046                                                                           0)))
4047                                  < TYPE_PRECISION (TREE_TYPE (off)))
4048                             off = TREE_OPERAND (off, 0);
4049                           if (TREE_CODE (off) == SSA_NAME)
4050                             {
4051                               gimple *def = SSA_NAME_DEF_STMT (off);
4052                               tree reft = TREE_TYPE (DR_REF (newdr));
4053                               if (is_gimple_call (def)
4054                                   && gimple_call_internal_p (def)
4055                                   && (gimple_call_internal_fn (def)
4056                                       == IFN_GOMP_SIMD_LANE))
4057                                 {
4058                                   tree arg = gimple_call_arg (def, 0);
4059                                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
4060                                   arg = SSA_NAME_VAR (arg);
4061                                   if (arg == loop->simduid
4062                                       /* For now.  */
4063                                       && tree_int_cst_equal
4064                                            (TYPE_SIZE_UNIT (reft),
4065                                             step))
4066                                     {
4067                                       DR_OFFSET (newdr) = ssize_int (0);
4068                                       DR_STEP (newdr) = step;
4069                                       DR_OFFSET_ALIGNMENT (newdr)
4070                                         = BIGGEST_ALIGNMENT;
4071                                       DR_STEP_ALIGNMENT (newdr)
4072                                         = highest_pow2_factor (step);
4073                                       dr = newdr;
4074                                       simd_lane_access = true;
4075                                     }
4076                                 }
4077                             }
4078                         }
4079                     }
4080                   if (!simd_lane_access && (maybe_gather || maybe_scatter))
4081                     {
4082                       dr = newdr;
4083                       if (maybe_gather)
4084                         gatherscatter = GATHER;
4085                       else
4086                         gatherscatter = SCATTER;
4087                     }
4088                 }
4089               if (gatherscatter == SG_NONE && !simd_lane_access)
4090                 free_data_ref (newdr);
4091             }
4092
4093           if (gatherscatter == SG_NONE && !simd_lane_access)
4094             {
4095               if (dump_enabled_p ())
4096                 {
4097                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4098                                    "not vectorized: data ref analysis "
4099                                    "failed ");
4100                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4101                 }
4102
4103               if (is_a <bb_vec_info> (vinfo))
4104                 break;
4105
4106               return false;
4107             }
4108         }
4109
4110       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4111         {
4112           if (dump_enabled_p ())
4113             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4114                              "not vectorized: base addr of dr is a "
4115                              "constant\n");
4116
4117           if (is_a <bb_vec_info> (vinfo))
4118             break;
4119
4120           if (gatherscatter != SG_NONE || simd_lane_access)
4121             free_data_ref (dr);
4122           return false;
4123         }
4124
4125       if (TREE_THIS_VOLATILE (DR_REF (dr)))
4126         {
4127           if (dump_enabled_p ())
4128             {
4129               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4130                                "not vectorized: volatile type ");
4131               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4132             }
4133
4134           if (is_a <bb_vec_info> (vinfo))
4135             break;
4136
4137           return false;
4138         }
4139
4140       if (stmt_can_throw_internal (stmt))
4141         {
4142           if (dump_enabled_p ())
4143             {
4144               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4145                                "not vectorized: statement can throw an "
4146                                "exception ");
4147               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4148             }
4149
4150           if (is_a <bb_vec_info> (vinfo))
4151             break;
4152
4153           if (gatherscatter != SG_NONE || simd_lane_access)
4154             free_data_ref (dr);
4155           return false;
4156         }
4157
4158       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4159           && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4160         {
4161           if (dump_enabled_p ())
4162             {
4163               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4164                                "not vectorized: statement is bitfield "
4165                                "access ");
4166               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4167             }
4168
4169           if (is_a <bb_vec_info> (vinfo))
4170             break;
4171
4172           if (gatherscatter != SG_NONE || simd_lane_access)
4173             free_data_ref (dr);
4174           return false;
4175         }
4176
4177       base = unshare_expr (DR_BASE_ADDRESS (dr));
4178       offset = unshare_expr (DR_OFFSET (dr));
4179       init = unshare_expr (DR_INIT (dr));
4180
4181       if (is_gimple_call (stmt)
4182           && (!gimple_call_internal_p (stmt)
4183               || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
4184                   && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
4185         {
4186           if (dump_enabled_p ())
4187             {
4188               dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
4189                                "not vectorized: dr in a call ");
4190               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4191             }
4192
4193           if (is_a <bb_vec_info> (vinfo))
4194             break;
4195
4196           if (gatherscatter != SG_NONE || simd_lane_access)
4197             free_data_ref (dr);
4198           return false;
4199         }
4200
4201       /* Update DR field in stmt_vec_info struct.  */
4202
4203       /* If the dataref is in an inner-loop of the loop that is considered for
4204          for vectorization, we also want to analyze the access relative to
4205          the outer-loop (DR contains information only relative to the
4206          inner-most enclosing loop).  We do that by building a reference to the
4207          first location accessed by the inner-loop, and analyze it relative to
4208          the outer-loop.  */
4209       if (loop && nested_in_vect_loop_p (loop, stmt))
4210         {
4211           /* Build a reference to the first location accessed by the
4212              inner loop: *(BASE + INIT + OFFSET).  By construction,
4213              this address must be invariant in the inner loop, so we
4214              can consider it as being used in the outer loop.  */
4215           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4216                                           init, offset);
4217           tree init_addr = fold_build_pointer_plus (base, init_offset);
4218           tree init_ref = build_fold_indirect_ref (init_addr);
4219
4220           if (dump_enabled_p ())
4221             {
4222               dump_printf_loc (MSG_NOTE, vect_location,
4223                                "analyze in outer loop: ");
4224               dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4225               dump_printf (MSG_NOTE, "\n");
4226             }
4227
4228           if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4229                                      init_ref, loop))
4230             /* dr_analyze_innermost already explained the failure.  */
4231             return false;
4232
4233           if (dump_enabled_p ())
4234             {
4235               dump_printf_loc (MSG_NOTE, vect_location,
4236                                "\touter base_address: ");
4237               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4238                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4239               dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4240               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4241                                  STMT_VINFO_DR_OFFSET (stmt_info));
4242               dump_printf (MSG_NOTE,
4243                            "\n\touter constant offset from base address: ");
4244               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4245                                  STMT_VINFO_DR_INIT (stmt_info));
4246               dump_printf (MSG_NOTE, "\n\touter step: ");
4247               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4248                                  STMT_VINFO_DR_STEP (stmt_info));
4249               dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4250                            STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4251               dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4252                            STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4253               dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4254                            STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4255               dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4256                            STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4257             }
4258         }
4259
4260       if (STMT_VINFO_DATA_REF (stmt_info))
4261         {
4262           if (dump_enabled_p ())
4263             {
4264               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4265                                "not vectorized: more than one data ref "
4266                                "in stmt: ");
4267               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4268             }
4269
4270           if (is_a <bb_vec_info> (vinfo))
4271             break;
4272
4273           if (gatherscatter != SG_NONE || simd_lane_access)
4274             free_data_ref (dr);
4275           return false;
4276         }
4277
4278       STMT_VINFO_DATA_REF (stmt_info) = dr;
4279       if (simd_lane_access)
4280         {
4281           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4282           free_data_ref (datarefs[i]);
4283           datarefs[i] = dr;
4284         }
4285
4286       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4287           && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4288           && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4289         {
4290           if (dump_enabled_p ())
4291             {
4292               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4293                                "not vectorized: base object not addressable "
4294                                "for stmt: ");
4295               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4296             }
4297           if (is_a <bb_vec_info> (vinfo))
4298             {
4299               /* In BB vectorization the ref can still participate
4300                  in dependence analysis, we just can't vectorize it.  */
4301               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4302               continue;
4303             }
4304           return false;
4305         }
4306
4307       /* Set vectype for STMT.  */
4308       scalar_type = TREE_TYPE (DR_REF (dr));
4309       STMT_VINFO_VECTYPE (stmt_info)
4310         = get_vectype_for_scalar_type (scalar_type);
4311       if (!STMT_VINFO_VECTYPE (stmt_info))
4312         {
4313           if (dump_enabled_p ())
4314             {
4315               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4316                                "not vectorized: no vectype for stmt: ");
4317               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4318               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4319               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4320                                  scalar_type);
4321               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4322             }
4323
4324           if (is_a <bb_vec_info> (vinfo))
4325             {
4326               /* No vector type is fine, the ref can still participate
4327                  in dependence analysis, we just can't vectorize it.  */
4328               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4329               continue;
4330             }
4331
4332           if (gatherscatter != SG_NONE || simd_lane_access)
4333             {
4334               STMT_VINFO_DATA_REF (stmt_info) = NULL;
4335               if (gatherscatter != SG_NONE)
4336                 free_data_ref (dr);
4337             }
4338           return false;
4339         }
4340       else
4341         {
4342           if (dump_enabled_p ())
4343             {
4344               dump_printf_loc (MSG_NOTE, vect_location,
4345                                "got vectype for stmt: ");
4346               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4347               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4348                                  STMT_VINFO_VECTYPE (stmt_info));
4349               dump_printf (MSG_NOTE, "\n");
4350             }
4351         }
4352
4353       /* Adjust the minimal vectorization factor according to the
4354          vector type.  */
4355       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4356       *min_vf = upper_bound (*min_vf, vf);
4357
4358       if (gatherscatter != SG_NONE)
4359         {
4360           gather_scatter_info gs_info;
4361           if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4362                                           &gs_info)
4363               || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4364             {
4365               STMT_VINFO_DATA_REF (stmt_info) = NULL;
4366               free_data_ref (dr);
4367               if (dump_enabled_p ())
4368                 {
4369                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4370                                    (gatherscatter == GATHER) ?
4371                                    "not vectorized: not suitable for gather "
4372                                    "load " :
4373                                    "not vectorized: not suitable for scatter "
4374                                    "store ");
4375                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4376                 }
4377               return false;
4378             }
4379
4380           free_data_ref (datarefs[i]);
4381           datarefs[i] = dr;
4382           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4383         }
4384
4385       else if (is_a <loop_vec_info> (vinfo)
4386                && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4387         {
4388           if (nested_in_vect_loop_p (loop, stmt))
4389             {
4390               if (dump_enabled_p ())
4391                 {
4392                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4393                                    "not vectorized: not suitable for strided "
4394                                    "load ");
4395                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4396                 }
4397               return false;
4398             }
4399           STMT_VINFO_STRIDED_P (stmt_info) = true;
4400         }
4401     }
4402
4403   /* If we stopped analysis at the first dataref we could not analyze
4404      when trying to vectorize a basic-block mark the rest of the datarefs
4405      as not vectorizable and truncate the vector of datarefs.  That
4406      avoids spending useless time in analyzing their dependence.  */
4407   if (i != datarefs.length ())
4408     {
4409       gcc_assert (is_a <bb_vec_info> (vinfo));
4410       for (unsigned j = i; j < datarefs.length (); ++j)
4411         {
4412           data_reference_p dr = datarefs[j];
4413           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4414           free_data_ref (dr);
4415         }
4416       datarefs.truncate (i);
4417     }
4418
4419   return true;
4420 }
4421
4422
4423 /* Function vect_get_new_vect_var.
4424
4425    Returns a name for a new variable.  The current naming scheme appends the
4426    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4427    the name of vectorizer generated variables, and appends that to NAME if
4428    provided.  */
4429
4430 tree
4431 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4432 {
4433   const char *prefix;
4434   tree new_vect_var;
4435
4436   switch (var_kind)
4437   {
4438   case vect_simple_var:
4439     prefix = "vect";
4440     break;
4441   case vect_scalar_var:
4442     prefix = "stmp";
4443     break;
4444   case vect_mask_var:
4445     prefix = "mask";
4446     break;
4447   case vect_pointer_var:
4448     prefix = "vectp";
4449     break;
4450   default:
4451     gcc_unreachable ();
4452   }
4453
4454   if (name)
4455     {
4456       char* tmp = concat (prefix, "_", name, NULL);
4457       new_vect_var = create_tmp_reg (type, tmp);
4458       free (tmp);
4459     }
4460   else
4461     new_vect_var = create_tmp_reg (type, prefix);
4462
4463   return new_vect_var;
4464 }
4465
4466 /* Like vect_get_new_vect_var but return an SSA name.  */
4467
4468 tree
4469 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4470 {
4471   const char *prefix;
4472   tree new_vect_var;
4473
4474   switch (var_kind)
4475   {
4476   case vect_simple_var:
4477     prefix = "vect";
4478     break;
4479   case vect_scalar_var:
4480     prefix = "stmp";
4481     break;
4482   case vect_pointer_var:
4483     prefix = "vectp";
4484     break;
4485   default:
4486     gcc_unreachable ();
4487   }
4488
4489   if (name)
4490     {
4491       char* tmp = concat (prefix, "_", name, NULL);
4492       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4493       free (tmp);
4494     }
4495   else
4496     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4497
4498   return new_vect_var;
4499 }
4500
4501 /* Duplicate ptr info and set alignment/misaligment on NAME from DR.  */
4502
4503 static void
4504 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4505 {
4506   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4507   int misalign = DR_MISALIGNMENT (dr);
4508   if (misalign == DR_MISALIGNMENT_UNKNOWN)
4509     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4510   else
4511     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4512                             DR_TARGET_ALIGNMENT (dr), misalign);
4513 }
4514
4515 /* Function vect_create_addr_base_for_vector_ref.
4516
4517    Create an expression that computes the address of the first memory location
4518    that will be accessed for a data reference.
4519
4520    Input:
4521    STMT: The statement containing the data reference.
4522    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4523    OFFSET: Optional. If supplied, it is be added to the initial address.
4524    LOOP:    Specify relative to which loop-nest should the address be computed.
4525             For example, when the dataref is in an inner-loop nested in an
4526             outer-loop that is now being vectorized, LOOP can be either the
4527             outer-loop, or the inner-loop.  The first memory location accessed
4528             by the following dataref ('in' points to short):
4529
4530                 for (i=0; i<N; i++)
4531                    for (j=0; j<M; j++)
4532                      s += in[i+j]
4533
4534             is as follows:
4535             if LOOP=i_loop:     &in             (relative to i_loop)
4536             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
4537    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4538             initial address.  Unlike OFFSET, which is number of elements to
4539             be added, BYTE_OFFSET is measured in bytes.
4540
4541    Output:
4542    1. Return an SSA_NAME whose value is the address of the memory location of
4543       the first vector of the data reference.
4544    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4545       these statement(s) which define the returned SSA_NAME.
4546
4547    FORNOW: We are only handling array accesses with step 1.  */
4548
4549 tree
4550 vect_create_addr_base_for_vector_ref (gimple *stmt,
4551                                       gimple_seq *new_stmt_list,
4552                                       tree offset,
4553                                       tree byte_offset)
4554 {
4555   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4556   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4557   const char *base_name;
4558   tree addr_base;
4559   tree dest;
4560   gimple_seq seq = NULL;
4561   tree vect_ptr_type;
4562   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4563   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4564   innermost_loop_behavior *drb = vect_dr_behavior (dr);
4565
4566   tree data_ref_base = unshare_expr (drb->base_address);
4567   tree base_offset = unshare_expr (drb->offset);
4568   tree init = unshare_expr (drb->init);
4569
4570   if (loop_vinfo)
4571     base_name = get_name (data_ref_base);
4572   else
4573     {
4574       base_offset = ssize_int (0);
4575       init = ssize_int (0);
4576       base_name = get_name (DR_REF (dr));
4577     }
4578
4579   /* Create base_offset */
4580   base_offset = size_binop (PLUS_EXPR,
4581                             fold_convert (sizetype, base_offset),
4582                             fold_convert (sizetype, init));
4583
4584   if (offset)
4585     {
4586       offset = fold_build2 (MULT_EXPR, sizetype,
4587                             fold_convert (sizetype, offset), step);
4588       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4589                                  base_offset, offset);
4590     }
4591   if (byte_offset)
4592     {
4593       byte_offset = fold_convert (sizetype, byte_offset);
4594       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4595                                  base_offset, byte_offset);
4596     }
4597
4598   /* base + base_offset */
4599   if (loop_vinfo)
4600     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4601   else
4602     {
4603       addr_base = build1 (ADDR_EXPR,
4604                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
4605                           unshare_expr (DR_REF (dr)));
4606     }
4607
4608   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4609   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4610   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4611   gimple_seq_add_seq (new_stmt_list, seq);
4612
4613   if (DR_PTR_INFO (dr)
4614       && TREE_CODE (addr_base) == SSA_NAME
4615       && !SSA_NAME_PTR_INFO (addr_base))
4616     {
4617       vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4618       if (offset || byte_offset)
4619         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4620     }
4621
4622   if (dump_enabled_p ())
4623     {
4624       dump_printf_loc (MSG_NOTE, vect_location, "created ");
4625       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4626       dump_printf (MSG_NOTE, "\n");
4627     }
4628
4629   return addr_base;
4630 }
4631
4632
4633 /* Function vect_create_data_ref_ptr.
4634
4635    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4636    location accessed in the loop by STMT, along with the def-use update
4637    chain to appropriately advance the pointer through the loop iterations.
4638    Also set aliasing information for the pointer.  This pointer is used by
4639    the callers to this function to create a memory reference expression for
4640    vector load/store access.
4641
4642    Input:
4643    1. STMT: a stmt that references memory. Expected to be of the form
4644          GIMPLE_ASSIGN <name, data-ref> or
4645          GIMPLE_ASSIGN <data-ref, name>.
4646    2. AGGR_TYPE: the type of the reference, which should be either a vector
4647         or an array.
4648    3. AT_LOOP: the loop where the vector memref is to be created.
4649    4. OFFSET (optional): an offset to be added to the initial address accessed
4650         by the data-ref in STMT.
4651    5. BSI: location where the new stmts are to be placed if there is no loop
4652    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4653         pointing to the initial address.
4654    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4655         to the initial address accessed by the data-ref in STMT.  This is
4656         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4657         in bytes.
4658    8. IV_STEP (optional, defaults to NULL): the amount that should be added
4659         to the IV during each iteration of the loop.  NULL says to move
4660         by one copy of AGGR_TYPE up or down, depending on the step of the
4661         data reference.
4662
4663    Output:
4664    1. Declare a new ptr to vector_type, and have it point to the base of the
4665       data reference (initial addressed accessed by the data reference).
4666       For example, for vector of type V8HI, the following code is generated:
4667
4668       v8hi *ap;
4669       ap = (v8hi *)initial_address;
4670
4671       if OFFSET is not supplied:
4672          initial_address = &a[init];
4673       if OFFSET is supplied:
4674          initial_address = &a[init + OFFSET];
4675       if BYTE_OFFSET is supplied:
4676          initial_address = &a[init] + BYTE_OFFSET;
4677
4678       Return the initial_address in INITIAL_ADDRESS.
4679
4680    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4681       update the pointer in each iteration of the loop.
4682
4683       Return the increment stmt that updates the pointer in PTR_INCR.
4684
4685    3. Set INV_P to true if the access pattern of the data reference in the
4686       vectorized loop is invariant.  Set it to false otherwise.
4687
4688    4. Return the pointer.  */
4689
4690 tree
4691 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4692                           tree offset, tree *initial_address,
4693                           gimple_stmt_iterator *gsi, gimple **ptr_incr,
4694                           bool only_init, bool *inv_p, tree byte_offset,
4695                           tree iv_step)
4696 {
4697   const char *base_name;
4698   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4699   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4700   struct loop *loop = NULL;
4701   bool nested_in_vect_loop = false;
4702   struct loop *containing_loop = NULL;
4703   tree aggr_ptr_type;
4704   tree aggr_ptr;
4705   tree new_temp;
4706   gimple_seq new_stmt_list = NULL;
4707   edge pe = NULL;
4708   basic_block new_bb;
4709   tree aggr_ptr_init;
4710   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4711   tree aptr;
4712   gimple_stmt_iterator incr_gsi;
4713   bool insert_after;
4714   tree indx_before_incr, indx_after_incr;
4715   gimple *incr;
4716   tree step;
4717   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4718
4719   gcc_assert (iv_step != NULL_TREE
4720               || TREE_CODE (aggr_type) == ARRAY_TYPE
4721               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4722
4723   if (loop_vinfo)
4724     {
4725       loop = LOOP_VINFO_LOOP (loop_vinfo);
4726       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4727       containing_loop = (gimple_bb (stmt))->loop_father;
4728       pe = loop_preheader_edge (loop);
4729     }
4730   else
4731     {
4732       gcc_assert (bb_vinfo);
4733       only_init = true;
4734       *ptr_incr = NULL;
4735     }
4736
4737   /* Check the step (evolution) of the load in LOOP, and record
4738      whether it's invariant.  */
4739   step = vect_dr_behavior (dr)->step;
4740   if (integer_zerop (step))
4741     *inv_p = true;
4742   else
4743     *inv_p = false;
4744
4745   /* Create an expression for the first address accessed by this load
4746      in LOOP.  */
4747   base_name = get_name (DR_BASE_ADDRESS (dr));
4748
4749   if (dump_enabled_p ())
4750     {
4751       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4752       dump_printf_loc (MSG_NOTE, vect_location,
4753                        "create %s-pointer variable to type: ",
4754                        get_tree_code_name (TREE_CODE (aggr_type)));
4755       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4756       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4757         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4758       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4759         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4760       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4761         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4762       else
4763         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4764       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4765       dump_printf (MSG_NOTE, "\n");
4766     }
4767
4768   /* (1) Create the new aggregate-pointer variable.
4769      Vector and array types inherit the alias set of their component
4770      type by default so we need to use a ref-all pointer if the data
4771      reference does not conflict with the created aggregated data
4772      reference because it is not addressable.  */
4773   bool need_ref_all = false;
4774   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4775                               get_alias_set (DR_REF (dr))))
4776     need_ref_all = true;
4777   /* Likewise for any of the data references in the stmt group.  */
4778   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4779     {
4780       gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4781       do
4782         {
4783           stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4784           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4785           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4786                                       get_alias_set (DR_REF (sdr))))
4787             {
4788               need_ref_all = true;
4789               break;
4790             }
4791           orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4792         }
4793       while (orig_stmt);
4794     }
4795   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4796                                                need_ref_all);
4797   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4798
4799
4800   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4801      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4802      def-use update cycles for the pointer: one relative to the outer-loop
4803      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4804      to the inner-loop (which is the inner-most loop containing the dataref),
4805      and this is done be step (5) below.
4806
4807      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4808      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4809      redundant.  Steps (3),(4) create the following:
4810
4811         vp0 = &base_addr;
4812         LOOP:   vp1 = phi(vp0,vp2)
4813                 ...
4814                 ...
4815                 vp2 = vp1 + step
4816                 goto LOOP
4817
4818      If there is an inner-loop nested in loop, then step (5) will also be
4819      applied, and an additional update in the inner-loop will be created:
4820
4821         vp0 = &base_addr;
4822         LOOP:   vp1 = phi(vp0,vp2)
4823                 ...
4824         inner:     vp3 = phi(vp1,vp4)
4825                    vp4 = vp3 + inner_step
4826                    if () goto inner
4827                 ...
4828                 vp2 = vp1 + step
4829                 if () goto LOOP   */
4830
4831   /* (2) Calculate the initial address of the aggregate-pointer, and set
4832      the aggregate-pointer to point to it before the loop.  */
4833
4834   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4835
4836   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4837                                                    offset, byte_offset);
4838   if (new_stmt_list)
4839     {
4840       if (pe)
4841         {
4842           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4843           gcc_assert (!new_bb);
4844         }
4845       else
4846         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4847     }
4848
4849   *initial_address = new_temp;
4850   aggr_ptr_init = new_temp;
4851
4852   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4853      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4854      inner-loop nested in LOOP (during outer-loop vectorization).  */
4855
4856   /* No update in loop is required.  */
4857   if (only_init && (!loop_vinfo || at_loop == loop))
4858     aptr = aggr_ptr_init;
4859   else
4860     {
4861       if (iv_step == NULL_TREE)
4862         {
4863           /* The step of the aggregate pointer is the type size.  */
4864           iv_step = TYPE_SIZE_UNIT (aggr_type);
4865           /* One exception to the above is when the scalar step of the load in
4866              LOOP is zero. In this case the step here is also zero.  */
4867           if (*inv_p)
4868             iv_step = size_zero_node;
4869           else if (tree_int_cst_sgn (step) == -1)
4870             iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4871         }
4872
4873       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4874
4875       create_iv (aggr_ptr_init,
4876                  fold_convert (aggr_ptr_type, iv_step),
4877                  aggr_ptr, loop, &incr_gsi, insert_after,
4878                  &indx_before_incr, &indx_after_incr);
4879       incr = gsi_stmt (incr_gsi);
4880       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4881
4882       /* Copy the points-to information if it exists. */
4883       if (DR_PTR_INFO (dr))
4884         {
4885           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4886           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4887         }
4888       if (ptr_incr)
4889         *ptr_incr = incr;
4890
4891       aptr = indx_before_incr;
4892     }
4893
4894   if (!nested_in_vect_loop || only_init)
4895     return aptr;
4896
4897
4898   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4899      nested in LOOP, if exists.  */
4900
4901   gcc_assert (nested_in_vect_loop);
4902   if (!only_init)
4903     {
4904       standard_iv_increment_position (containing_loop, &incr_gsi,
4905                                       &insert_after);
4906       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4907                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4908                  &indx_after_incr);
4909       incr = gsi_stmt (incr_gsi);
4910       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4911
4912       /* Copy the points-to information if it exists. */
4913       if (DR_PTR_INFO (dr))
4914         {
4915           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4916           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4917         }
4918       if (ptr_incr)
4919         *ptr_incr = incr;
4920
4921       return indx_before_incr;
4922     }
4923   else
4924     gcc_unreachable ();
4925 }
4926
4927
4928 /* Function bump_vector_ptr
4929
4930    Increment a pointer (to a vector type) by vector-size. If requested,
4931    i.e. if PTR-INCR is given, then also connect the new increment stmt
4932    to the existing def-use update-chain of the pointer, by modifying
4933    the PTR_INCR as illustrated below:
4934
4935    The pointer def-use update-chain before this function:
4936                         DATAREF_PTR = phi (p_0, p_2)
4937                         ....
4938         PTR_INCR:       p_2 = DATAREF_PTR + step
4939
4940    The pointer def-use update-chain after this function:
4941                         DATAREF_PTR = phi (p_0, p_2)
4942                         ....
4943                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4944                         ....
4945         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4946
4947    Input:
4948    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4949                  in the loop.
4950    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4951               the loop.  The increment amount across iterations is expected
4952               to be vector_size.
4953    BSI - location where the new update stmt is to be placed.
4954    STMT - the original scalar memory-access stmt that is being vectorized.
4955    BUMP - optional. The offset by which to bump the pointer. If not given,
4956           the offset is assumed to be vector_size.
4957
4958    Output: Return NEW_DATAREF_PTR as illustrated above.
4959
4960 */
4961
4962 tree
4963 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4964                  gimple *stmt, tree bump)
4965 {
4966   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4967   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4968   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4969   tree update = TYPE_SIZE_UNIT (vectype);
4970   gassign *incr_stmt;
4971   ssa_op_iter iter;
4972   use_operand_p use_p;
4973   tree new_dataref_ptr;
4974
4975   if (bump)
4976     update = bump;
4977
4978   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4979     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4980   else
4981     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4982   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4983                                    dataref_ptr, update);
4984   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4985
4986   /* Copy the points-to information if it exists. */
4987   if (DR_PTR_INFO (dr))
4988     {
4989       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4990       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4991     }
4992
4993   if (!ptr_incr)
4994     return new_dataref_ptr;
4995
4996   /* Update the vector-pointer's cross-iteration increment.  */
4997   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4998     {
4999       tree use = USE_FROM_PTR (use_p);
5000
5001       if (use == dataref_ptr)
5002         SET_USE (use_p, new_dataref_ptr);
5003       else
5004         gcc_assert (operand_equal_p (use, update, 0));
5005     }
5006
5007   return new_dataref_ptr;
5008 }
5009
5010
5011 /* Function vect_create_destination_var.
5012
5013    Create a new temporary of type VECTYPE.  */
5014
5015 tree
5016 vect_create_destination_var (tree scalar_dest, tree vectype)
5017 {
5018   tree vec_dest;
5019   const char *name;
5020   char *new_name;
5021   tree type;
5022   enum vect_var_kind kind;
5023
5024   kind = vectype
5025     ? VECTOR_BOOLEAN_TYPE_P (vectype)
5026     ? vect_mask_var
5027     : vect_simple_var
5028     : vect_scalar_var;
5029   type = vectype ? vectype : TREE_TYPE (scalar_dest);
5030
5031   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5032
5033   name = get_name (scalar_dest);
5034   if (name)
5035     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5036   else
5037     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5038   vec_dest = vect_get_new_vect_var (type, kind, new_name);
5039   free (new_name);
5040
5041   return vec_dest;
5042 }
5043
5044 /* Function vect_grouped_store_supported.
5045
5046    Returns TRUE if interleave high and interleave low permutations
5047    are supported, and FALSE otherwise.  */
5048
5049 bool
5050 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5051 {
5052   machine_mode mode = TYPE_MODE (vectype);
5053
5054   /* vect_permute_store_chain requires the group size to be equal to 3 or
5055      be a power of two.  */
5056   if (count != 3 && exact_log2 (count) == -1)
5057     {
5058       if (dump_enabled_p ())
5059         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5060                          "the size of the group of accesses"
5061                          " is not a power of 2 or not eqaul to 3\n");
5062       return false;
5063     }
5064
5065   /* Check that the permutation is supported.  */
5066   if (VECTOR_MODE_P (mode))
5067     {
5068       unsigned int i;
5069       if (count == 3)
5070         {
5071           unsigned int j0 = 0, j1 = 0, j2 = 0;
5072           unsigned int i, j;
5073
5074           unsigned int nelt;
5075           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5076             {
5077               if (dump_enabled_p ())
5078                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5079                                  "cannot handle groups of 3 stores for"
5080                                  " variable-length vectors\n");
5081               return false;
5082             }
5083
5084           vec_perm_builder sel (nelt, nelt, 1);
5085           sel.quick_grow (nelt);
5086           vec_perm_indices indices;
5087           for (j = 0; j < 3; j++)
5088             {
5089               int nelt0 = ((3 - j) * nelt) % 3;
5090               int nelt1 = ((3 - j) * nelt + 1) % 3;
5091               int nelt2 = ((3 - j) * nelt + 2) % 3;
5092               for (i = 0; i < nelt; i++)
5093                 {
5094                   if (3 * i + nelt0 < nelt)
5095                     sel[3 * i + nelt0] = j0++;
5096                   if (3 * i + nelt1 < nelt)
5097                     sel[3 * i + nelt1] = nelt + j1++;
5098                   if (3 * i + nelt2 < nelt)
5099                     sel[3 * i + nelt2] = 0;
5100                 }
5101               indices.new_vector (sel, 2, nelt);
5102               if (!can_vec_perm_const_p (mode, indices))
5103                 {
5104                   if (dump_enabled_p ())
5105                     dump_printf (MSG_MISSED_OPTIMIZATION,
5106                                  "permutation op not supported by target.\n");
5107                   return false;
5108                 }
5109
5110               for (i = 0; i < nelt; i++)
5111                 {
5112                   if (3 * i + nelt0 < nelt)
5113                     sel[3 * i + nelt0] = 3 * i + nelt0;
5114                   if (3 * i + nelt1 < nelt)
5115                     sel[3 * i + nelt1] = 3 * i + nelt1;
5116                   if (3 * i + nelt2 < nelt)
5117                     sel[3 * i + nelt2] = nelt + j2++;
5118                 }
5119               indices.new_vector (sel, 2, nelt);
5120               if (!can_vec_perm_const_p (mode, indices))
5121                 {
5122                   if (dump_enabled_p ())
5123                     dump_printf (MSG_MISSED_OPTIMIZATION,
5124                                  "permutation op not supported by target.\n");
5125                   return false;
5126                 }
5127             }
5128           return true;
5129         }
5130       else
5131         {
5132           /* If length is not equal to 3 then only power of 2 is supported.  */
5133           gcc_assert (pow2p_hwi (count));
5134           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5135
5136           /* The encoding has 2 interleaved stepped patterns.  */
5137           vec_perm_builder sel (nelt, 2, 3);
5138           sel.quick_grow (6);
5139           for (i = 0; i < 3; i++)
5140             {
5141               sel[i * 2] = i;
5142               sel[i * 2 + 1] = i + nelt;
5143             }
5144           vec_perm_indices indices (sel, 2, nelt);
5145           if (can_vec_perm_const_p (mode, indices))
5146             {
5147               for (i = 0; i < 6; i++)
5148                 sel[i] += exact_div (nelt, 2);
5149               indices.new_vector (sel, 2, nelt);
5150               if (can_vec_perm_const_p (mode, indices))
5151                 return true;
5152             }
5153         }
5154     }
5155
5156   if (dump_enabled_p ())
5157     dump_printf (MSG_MISSED_OPTIMIZATION,
5158                  "permutaion op not supported by target.\n");
5159   return false;
5160 }
5161
5162
5163 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5164    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5165
5166 bool
5167 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5168                             bool masked_p)
5169 {
5170   if (masked_p)
5171     return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5172                                          vec_mask_store_lanes_optab,
5173                                          vectype, count);
5174   else
5175     return vect_lanes_optab_supported_p ("vec_store_lanes",
5176                                          vec_store_lanes_optab,
5177                                          vectype, count);
5178 }
5179
5180
5181 /* Function vect_permute_store_chain.
5182
5183    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5184    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5185    the data correctly for the stores.  Return the final references for stores
5186    in RESULT_CHAIN.
5187
5188    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5189    The input is 4 vectors each containing 8 elements.  We assign a number to
5190    each element, the input sequence is:
5191
5192    1st vec:   0  1  2  3  4  5  6  7
5193    2nd vec:   8  9 10 11 12 13 14 15
5194    3rd vec:  16 17 18 19 20 21 22 23
5195    4th vec:  24 25 26 27 28 29 30 31
5196
5197    The output sequence should be:
5198
5199    1st vec:  0  8 16 24  1  9 17 25
5200    2nd vec:  2 10 18 26  3 11 19 27
5201    3rd vec:  4 12 20 28  5 13 21 30
5202    4th vec:  6 14 22 30  7 15 23 31
5203
5204    i.e., we interleave the contents of the four vectors in their order.
5205
5206    We use interleave_high/low instructions to create such output.  The input of
5207    each interleave_high/low operation is two vectors:
5208    1st vec    2nd vec
5209    0 1 2 3    4 5 6 7
5210    the even elements of the result vector are obtained left-to-right from the
5211    high/low elements of the first vector.  The odd elements of the result are
5212    obtained left-to-right from the high/low elements of the second vector.
5213    The output of interleave_high will be:   0 4 1 5
5214    and of interleave_low:                   2 6 3 7
5215
5216
5217    The permutation is done in log LENGTH stages.  In each stage interleave_high
5218    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5219    where the first argument is taken from the first half of DR_CHAIN and the
5220    second argument from it's second half.
5221    In our example,
5222
5223    I1: interleave_high (1st vec, 3rd vec)
5224    I2: interleave_low (1st vec, 3rd vec)
5225    I3: interleave_high (2nd vec, 4th vec)
5226    I4: interleave_low (2nd vec, 4th vec)
5227
5228    The output for the first stage is:
5229
5230    I1:  0 16  1 17  2 18  3 19
5231    I2:  4 20  5 21  6 22  7 23
5232    I3:  8 24  9 25 10 26 11 27
5233    I4: 12 28 13 29 14 30 15 31
5234
5235    The output of the second stage, i.e. the final result is:
5236
5237    I1:  0  8 16 24  1  9 17 25
5238    I2:  2 10 18 26  3 11 19 27
5239    I3:  4 12 20 28  5 13 21 30
5240    I4:  6 14 22 30  7 15 23 31.  */
5241
5242 void
5243 vect_permute_store_chain (vec<tree> dr_chain,
5244                           unsigned int length,
5245                           gimple *stmt,
5246                           gimple_stmt_iterator *gsi,
5247                           vec<tree> *result_chain)
5248 {
5249   tree vect1, vect2, high, low;
5250   gimple *perm_stmt;
5251   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5252   tree perm_mask_low, perm_mask_high;
5253   tree data_ref;
5254   tree perm3_mask_low, perm3_mask_high;
5255   unsigned int i, j, n, log_length = exact_log2 (length);
5256
5257   result_chain->quick_grow (length);
5258   memcpy (result_chain->address (), dr_chain.address (),
5259           length * sizeof (tree));
5260
5261   if (length == 3)
5262     {
5263       /* vect_grouped_store_supported ensures that this is constant.  */
5264       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5265       unsigned int j0 = 0, j1 = 0, j2 = 0;
5266
5267       vec_perm_builder sel (nelt, nelt, 1);
5268       sel.quick_grow (nelt);
5269       vec_perm_indices indices;
5270       for (j = 0; j < 3; j++)
5271         {
5272           int nelt0 = ((3 - j) * nelt) % 3;
5273           int nelt1 = ((3 - j) * nelt + 1) % 3;
5274           int nelt2 = ((3 - j) * nelt + 2) % 3;
5275
5276           for (i = 0; i < nelt; i++)
5277             {
5278               if (3 * i + nelt0 < nelt)
5279                 sel[3 * i + nelt0] = j0++;
5280               if (3 * i + nelt1 < nelt)
5281                 sel[3 * i + nelt1] = nelt + j1++;
5282               if (3 * i + nelt2 < nelt)
5283                 sel[3 * i + nelt2] = 0;
5284             }
5285           indices.new_vector (sel, 2, nelt);
5286           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5287
5288           for (i = 0; i < nelt; i++)
5289             {
5290               if (3 * i + nelt0 < nelt)
5291                 sel[3 * i + nelt0] = 3 * i + nelt0;
5292               if (3 * i + nelt1 < nelt)
5293                 sel[3 * i + nelt1] = 3 * i + nelt1;
5294               if (3 * i + nelt2 < nelt)
5295                 sel[3 * i + nelt2] = nelt + j2++;
5296             }
5297           indices.new_vector (sel, 2, nelt);
5298           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5299
5300           vect1 = dr_chain[0];
5301           vect2 = dr_chain[1];
5302
5303           /* Create interleaving stmt:
5304              low = VEC_PERM_EXPR <vect1, vect2,
5305                                   {j, nelt, *, j + 1, nelt + j + 1, *,
5306                                    j + 2, nelt + j + 2, *, ...}>  */
5307           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5308           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5309                                            vect2, perm3_mask_low);
5310           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5311
5312           vect1 = data_ref;
5313           vect2 = dr_chain[2];
5314           /* Create interleaving stmt:
5315              low = VEC_PERM_EXPR <vect1, vect2,
5316                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
5317                                    6, 7, nelt + j + 2, ...}>  */
5318           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5319           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5320                                            vect2, perm3_mask_high);
5321           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5322           (*result_chain)[j] = data_ref;
5323         }
5324     }
5325   else
5326     {
5327       /* If length is not equal to 3 then only power of 2 is supported.  */
5328       gcc_assert (pow2p_hwi (length));
5329
5330       /* The encoding has 2 interleaved stepped patterns.  */
5331       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5332       vec_perm_builder sel (nelt, 2, 3);
5333       sel.quick_grow (6);
5334       for (i = 0; i < 3; i++)
5335         {
5336           sel[i * 2] = i;
5337           sel[i * 2 + 1] = i + nelt;
5338         }
5339         vec_perm_indices indices (sel, 2, nelt);
5340         perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5341
5342         for (i = 0; i < 6; i++)
5343           sel[i] += exact_div (nelt, 2);
5344         indices.new_vector (sel, 2, nelt);
5345         perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5346
5347         for (i = 0, n = log_length; i < n; i++)
5348           {
5349             for (j = 0; j < length/2; j++)
5350               {
5351                 vect1 = dr_chain[j];
5352                 vect2 = dr_chain[j+length/2];
5353
5354                 /* Create interleaving stmt:
5355                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5356                                                         ...}>  */
5357                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5358                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5359                                                  vect2, perm_mask_high);
5360                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5361                 (*result_chain)[2*j] = high;
5362
5363                 /* Create interleaving stmt:
5364                    low = VEC_PERM_EXPR <vect1, vect2,
5365                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5366                                          ...}>  */
5367                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5368                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5369                                                  vect2, perm_mask_low);
5370                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5371                 (*result_chain)[2*j+1] = low;
5372               }
5373             memcpy (dr_chain.address (), result_chain->address (),
5374                     length * sizeof (tree));
5375           }
5376     }
5377 }
5378
5379 /* Function vect_setup_realignment
5380
5381    This function is called when vectorizing an unaligned load using
5382    the dr_explicit_realign[_optimized] scheme.
5383    This function generates the following code at the loop prolog:
5384
5385       p = initial_addr;
5386    x  msq_init = *(floor(p));   # prolog load
5387       realignment_token = call target_builtin;
5388     loop:
5389    x  msq = phi (msq_init, ---)
5390
5391    The stmts marked with x are generated only for the case of
5392    dr_explicit_realign_optimized.
5393
5394    The code above sets up a new (vector) pointer, pointing to the first
5395    location accessed by STMT, and a "floor-aligned" load using that pointer.
5396    It also generates code to compute the "realignment-token" (if the relevant
5397    target hook was defined), and creates a phi-node at the loop-header bb
5398    whose arguments are the result of the prolog-load (created by this
5399    function) and the result of a load that takes place in the loop (to be
5400    created by the caller to this function).
5401
5402    For the case of dr_explicit_realign_optimized:
5403    The caller to this function uses the phi-result (msq) to create the
5404    realignment code inside the loop, and sets up the missing phi argument,
5405    as follows:
5406     loop:
5407       msq = phi (msq_init, lsq)
5408       lsq = *(floor(p'));        # load in loop
5409       result = realign_load (msq, lsq, realignment_token);
5410
5411    For the case of dr_explicit_realign:
5412     loop:
5413       msq = *(floor(p));        # load in loop
5414       p' = p + (VS-1);
5415       lsq = *(floor(p'));       # load in loop
5416       result = realign_load (msq, lsq, realignment_token);
5417
5418    Input:
5419    STMT - (scalar) load stmt to be vectorized. This load accesses
5420           a memory location that may be unaligned.
5421    BSI - place where new code is to be inserted.
5422    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5423                               is used.
5424
5425    Output:
5426    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5427                        target hook, if defined.
5428    Return value - the result of the loop-header phi node.  */
5429
5430 tree
5431 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5432                         tree *realignment_token,
5433                         enum dr_alignment_support alignment_support_scheme,
5434                         tree init_addr,
5435                         struct loop **at_loop)
5436 {
5437   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5438   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5439   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5440   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5441   struct loop *loop = NULL;
5442   edge pe = NULL;
5443   tree scalar_dest = gimple_assign_lhs (stmt);
5444   tree vec_dest;
5445   gimple *inc;
5446   tree ptr;
5447   tree data_ref;
5448   basic_block new_bb;
5449   tree msq_init = NULL_TREE;
5450   tree new_temp;
5451   gphi *phi_stmt;
5452   tree msq = NULL_TREE;
5453   gimple_seq stmts = NULL;
5454   bool inv_p;
5455   bool compute_in_loop = false;
5456   bool nested_in_vect_loop = false;
5457   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5458   struct loop *loop_for_initial_load = NULL;
5459
5460   if (loop_vinfo)
5461     {
5462       loop = LOOP_VINFO_LOOP (loop_vinfo);
5463       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5464     }
5465
5466   gcc_assert (alignment_support_scheme == dr_explicit_realign
5467               || alignment_support_scheme == dr_explicit_realign_optimized);
5468
5469   /* We need to generate three things:
5470      1. the misalignment computation
5471      2. the extra vector load (for the optimized realignment scheme).
5472      3. the phi node for the two vectors from which the realignment is
5473       done (for the optimized realignment scheme).  */
5474
5475   /* 1. Determine where to generate the misalignment computation.
5476
5477      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5478      calculation will be generated by this function, outside the loop (in the
5479      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5480      caller, inside the loop.
5481
5482      Background: If the misalignment remains fixed throughout the iterations of
5483      the loop, then both realignment schemes are applicable, and also the
5484      misalignment computation can be done outside LOOP.  This is because we are
5485      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5486      are a multiple of VS (the Vector Size), and therefore the misalignment in
5487      different vectorized LOOP iterations is always the same.
5488      The problem arises only if the memory access is in an inner-loop nested
5489      inside LOOP, which is now being vectorized using outer-loop vectorization.
5490      This is the only case when the misalignment of the memory access may not
5491      remain fixed throughout the iterations of the inner-loop (as explained in
5492      detail in vect_supportable_dr_alignment).  In this case, not only is the
5493      optimized realignment scheme not applicable, but also the misalignment
5494      computation (and generation of the realignment token that is passed to
5495      REALIGN_LOAD) have to be done inside the loop.
5496
5497      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5498      or not, which in turn determines if the misalignment is computed inside
5499      the inner-loop, or outside LOOP.  */
5500
5501   if (init_addr != NULL_TREE || !loop_vinfo)
5502     {
5503       compute_in_loop = true;
5504       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5505     }
5506
5507
5508   /* 2. Determine where to generate the extra vector load.
5509
5510      For the optimized realignment scheme, instead of generating two vector
5511      loads in each iteration, we generate a single extra vector load in the
5512      preheader of the loop, and in each iteration reuse the result of the
5513      vector load from the previous iteration.  In case the memory access is in
5514      an inner-loop nested inside LOOP, which is now being vectorized using
5515      outer-loop vectorization, we need to determine whether this initial vector
5516      load should be generated at the preheader of the inner-loop, or can be
5517      generated at the preheader of LOOP.  If the memory access has no evolution
5518      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5519      to be generated inside LOOP (in the preheader of the inner-loop).  */
5520
5521   if (nested_in_vect_loop)
5522     {
5523       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5524       bool invariant_in_outerloop =
5525             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5526       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5527     }
5528   else
5529     loop_for_initial_load = loop;
5530   if (at_loop)
5531     *at_loop = loop_for_initial_load;
5532
5533   if (loop_for_initial_load)
5534     pe = loop_preheader_edge (loop_for_initial_load);
5535
5536   /* 3. For the case of the optimized realignment, create the first vector
5537       load at the loop preheader.  */
5538
5539   if (alignment_support_scheme == dr_explicit_realign_optimized)
5540     {
5541       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5542       gassign *new_stmt;
5543
5544       gcc_assert (!compute_in_loop);
5545       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5546       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5547                                       NULL_TREE, &init_addr, NULL, &inc,
5548                                       true, &inv_p);
5549       if (TREE_CODE (ptr) == SSA_NAME)
5550         new_temp = copy_ssa_name (ptr);
5551       else
5552         new_temp = make_ssa_name (TREE_TYPE (ptr));
5553       unsigned int align = DR_TARGET_ALIGNMENT (dr);
5554       new_stmt = gimple_build_assign
5555                    (new_temp, BIT_AND_EXPR, ptr,
5556                     build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5557       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5558       gcc_assert (!new_bb);
5559       data_ref
5560         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5561                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5562       new_stmt = gimple_build_assign (vec_dest, data_ref);
5563       new_temp = make_ssa_name (vec_dest, new_stmt);
5564       gimple_assign_set_lhs (new_stmt, new_temp);
5565       if (pe)
5566         {
5567           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5568           gcc_assert (!new_bb);
5569         }
5570       else
5571          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5572
5573       msq_init = gimple_assign_lhs (new_stmt);
5574     }
5575
5576   /* 4. Create realignment token using a target builtin, if available.
5577       It is done either inside the containing loop, or before LOOP (as
5578       determined above).  */
5579
5580   if (targetm.vectorize.builtin_mask_for_load)
5581     {
5582       gcall *new_stmt;
5583       tree builtin_decl;
5584
5585       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
5586       if (!init_addr)
5587         {
5588           /* Generate the INIT_ADDR computation outside LOOP.  */
5589           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5590                                                             NULL_TREE);
5591           if (loop)
5592             {
5593               pe = loop_preheader_edge (loop);
5594               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5595               gcc_assert (!new_bb);
5596             }
5597           else
5598              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5599         }
5600
5601       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5602       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5603       vec_dest =
5604         vect_create_destination_var (scalar_dest,
5605                                      gimple_call_return_type (new_stmt));
5606       new_temp = make_ssa_name (vec_dest, new_stmt);
5607       gimple_call_set_lhs (new_stmt, new_temp);
5608
5609       if (compute_in_loop)
5610         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5611       else
5612         {
5613           /* Generate the misalignment computation outside LOOP.  */
5614           pe = loop_preheader_edge (loop);
5615           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5616           gcc_assert (!new_bb);
5617         }
5618
5619       *realignment_token = gimple_call_lhs (new_stmt);
5620
5621       /* The result of the CALL_EXPR to this builtin is determined from
5622          the value of the parameter and no global variables are touched
5623          which makes the builtin a "const" function.  Requiring the
5624          builtin to have the "const" attribute makes it unnecessary
5625          to call mark_call_clobbered.  */
5626       gcc_assert (TREE_READONLY (builtin_decl));
5627     }
5628
5629   if (alignment_support_scheme == dr_explicit_realign)
5630     return msq;
5631
5632   gcc_assert (!compute_in_loop);
5633   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5634
5635
5636   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5637
5638   pe = loop_preheader_edge (containing_loop);
5639   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5640   msq = make_ssa_name (vec_dest);
5641   phi_stmt = create_phi_node (msq, containing_loop->header);
5642   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5643
5644   return msq;
5645 }
5646
5647
5648 /* Function vect_grouped_load_supported.
5649
5650    COUNT is the size of the load group (the number of statements plus the
5651    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5652    only one statement, with a gap of COUNT - 1.
5653
5654    Returns true if a suitable permute exists.  */
5655
5656 bool
5657 vect_grouped_load_supported (tree vectype, bool single_element_p,
5658                              unsigned HOST_WIDE_INT count)
5659 {
5660   machine_mode mode = TYPE_MODE (vectype);
5661
5662   /* If this is single-element interleaving with an element distance
5663      that leaves unused vector loads around punt - we at least create
5664      very sub-optimal code in that case (and blow up memory,
5665      see PR65518).  */
5666   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5667     {
5668       if (dump_enabled_p ())
5669         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5670                          "single-element interleaving not supported "
5671                          "for not adjacent vector loads\n");
5672       return false;
5673     }
5674
5675   /* vect_permute_load_chain requires the group size to be equal to 3 or
5676      be a power of two.  */
5677   if (count != 3 && exact_log2 (count) == -1)
5678     {
5679       if (dump_enabled_p ())
5680         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5681                          "the size of the group of accesses"
5682                          " is not a power of 2 or not equal to 3\n");
5683       return false;
5684     }
5685
5686   /* Check that the permutation is supported.  */
5687   if (VECTOR_MODE_P (mode))
5688     {
5689       unsigned int i, j;
5690       if (count == 3)
5691         {
5692           unsigned int nelt;
5693           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5694             {
5695               if (dump_enabled_p ())
5696                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5697                                  "cannot handle groups of 3 loads for"
5698                                  " variable-length vectors\n");
5699               return false;
5700             }
5701
5702           vec_perm_builder sel (nelt, nelt, 1);
5703           sel.quick_grow (nelt);
5704           vec_perm_indices indices;
5705           unsigned int k;
5706           for (k = 0; k < 3; k++)
5707             {
5708               for (i = 0; i < nelt; i++)
5709                 if (3 * i + k < 2 * nelt)
5710                   sel[i] = 3 * i + k;
5711                 else
5712                   sel[i] = 0;
5713               indices.new_vector (sel, 2, nelt);
5714               if (!can_vec_perm_const_p (mode, indices))
5715                 {
5716                   if (dump_enabled_p ())
5717                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5718                                      "shuffle of 3 loads is not supported by"
5719                                      " target\n");
5720                   return false;
5721                 }
5722               for (i = 0, j = 0; i < nelt; i++)
5723                 if (3 * i + k < 2 * nelt)
5724                   sel[i] = i;
5725                 else
5726                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5727               indices.new_vector (sel, 2, nelt);
5728               if (!can_vec_perm_const_p (mode, indices))
5729                 {
5730                   if (dump_enabled_p ())
5731                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5732                                      "shuffle of 3 loads is not supported by"
5733                                      " target\n");
5734                   return false;
5735                 }
5736             }
5737           return true;
5738         }
5739       else
5740         {
5741           /* If length is not equal to 3 then only power of 2 is supported.  */
5742           gcc_assert (pow2p_hwi (count));
5743           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5744
5745           /* The encoding has a single stepped pattern.  */
5746           vec_perm_builder sel (nelt, 1, 3);
5747           sel.quick_grow (3);
5748           for (i = 0; i < 3; i++)
5749             sel[i] = i * 2;
5750           vec_perm_indices indices (sel, 2, nelt);
5751           if (can_vec_perm_const_p (mode, indices))
5752             {
5753               for (i = 0; i < 3; i++)
5754                 sel[i] = i * 2 + 1;
5755               indices.new_vector (sel, 2, nelt);
5756               if (can_vec_perm_const_p (mode, indices))
5757                 return true;
5758             }
5759         }
5760     }
5761
5762   if (dump_enabled_p ())
5763     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5764                      "extract even/odd not supported by target\n");
5765   return false;
5766 }
5767
5768 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5769    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5770
5771 bool
5772 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5773                            bool masked_p)
5774 {
5775   if (masked_p)
5776     return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5777                                          vec_mask_load_lanes_optab,
5778                                          vectype, count);
5779   else
5780     return vect_lanes_optab_supported_p ("vec_load_lanes",
5781                                          vec_load_lanes_optab,
5782                                          vectype, count);
5783 }
5784
5785 /* Function vect_permute_load_chain.
5786
5787    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5788    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5789    the input data correctly.  Return the final references for loads in
5790    RESULT_CHAIN.
5791
5792    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5793    The input is 4 vectors each containing 8 elements. We assign a number to each
5794    element, the input sequence is:
5795
5796    1st vec:   0  1  2  3  4  5  6  7
5797    2nd vec:   8  9 10 11 12 13 14 15
5798    3rd vec:  16 17 18 19 20 21 22 23
5799    4th vec:  24 25 26 27 28 29 30 31
5800
5801    The output sequence should be:
5802
5803    1st vec:  0 4  8 12 16 20 24 28
5804    2nd vec:  1 5  9 13 17 21 25 29
5805    3rd vec:  2 6 10 14 18 22 26 30
5806    4th vec:  3 7 11 15 19 23 27 31
5807
5808    i.e., the first output vector should contain the first elements of each
5809    interleaving group, etc.
5810
5811    We use extract_even/odd instructions to create such output.  The input of
5812    each extract_even/odd operation is two vectors
5813    1st vec    2nd vec
5814    0 1 2 3    4 5 6 7
5815
5816    and the output is the vector of extracted even/odd elements.  The output of
5817    extract_even will be:   0 2 4 6
5818    and of extract_odd:     1 3 5 7
5819
5820
5821    The permutation is done in log LENGTH stages.  In each stage extract_even
5822    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5823    their order.  In our example,
5824
5825    E1: extract_even (1st vec, 2nd vec)
5826    E2: extract_odd (1st vec, 2nd vec)
5827    E3: extract_even (3rd vec, 4th vec)
5828    E4: extract_odd (3rd vec, 4th vec)
5829
5830    The output for the first stage will be:
5831
5832    E1:  0  2  4  6  8 10 12 14
5833    E2:  1  3  5  7  9 11 13 15
5834    E3: 16 18 20 22 24 26 28 30
5835    E4: 17 19 21 23 25 27 29 31
5836
5837    In order to proceed and create the correct sequence for the next stage (or
5838    for the correct output, if the second stage is the last one, as in our
5839    example), we first put the output of extract_even operation and then the
5840    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5841    The input for the second stage is:
5842
5843    1st vec (E1):  0  2  4  6  8 10 12 14
5844    2nd vec (E3): 16 18 20 22 24 26 28 30
5845    3rd vec (E2):  1  3  5  7  9 11 13 15
5846    4th vec (E4): 17 19 21 23 25 27 29 31
5847
5848    The output of the second stage:
5849
5850    E1: 0 4  8 12 16 20 24 28
5851    E2: 2 6 10 14 18 22 26 30
5852    E3: 1 5  9 13 17 21 25 29
5853    E4: 3 7 11 15 19 23 27 31
5854
5855    And RESULT_CHAIN after reordering:
5856
5857    1st vec (E1):  0 4  8 12 16 20 24 28
5858    2nd vec (E3):  1 5  9 13 17 21 25 29
5859    3rd vec (E2):  2 6 10 14 18 22 26 30
5860    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5861
5862 static void
5863 vect_permute_load_chain (vec<tree> dr_chain,
5864                          unsigned int length,
5865                          gimple *stmt,
5866                          gimple_stmt_iterator *gsi,
5867                          vec<tree> *result_chain)
5868 {
5869   tree data_ref, first_vect, second_vect;
5870   tree perm_mask_even, perm_mask_odd;
5871   tree perm3_mask_low, perm3_mask_high;
5872   gimple *perm_stmt;
5873   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5874   unsigned int i, j, log_length = exact_log2 (length);
5875
5876   result_chain->quick_grow (length);
5877   memcpy (result_chain->address (), dr_chain.address (),
5878           length * sizeof (tree));
5879
5880   if (length == 3)
5881     {
5882       /* vect_grouped_load_supported ensures that this is constant.  */
5883       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5884       unsigned int k;
5885
5886       vec_perm_builder sel (nelt, nelt, 1);
5887       sel.quick_grow (nelt);
5888       vec_perm_indices indices;
5889       for (k = 0; k < 3; k++)
5890         {
5891           for (i = 0; i < nelt; i++)
5892             if (3 * i + k < 2 * nelt)
5893               sel[i] = 3 * i + k;
5894             else
5895               sel[i] = 0;
5896           indices.new_vector (sel, 2, nelt);
5897           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5898
5899           for (i = 0, j = 0; i < nelt; i++)
5900             if (3 * i + k < 2 * nelt)
5901               sel[i] = i;
5902             else
5903               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5904           indices.new_vector (sel, 2, nelt);
5905           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5906
5907           first_vect = dr_chain[0];
5908           second_vect = dr_chain[1];
5909
5910           /* Create interleaving stmt (low part of):
5911              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5912                                                              ...}>  */
5913           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5914           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5915                                            second_vect, perm3_mask_low);
5916           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5917
5918           /* Create interleaving stmt (high part of):
5919              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5920                                                               ...}>  */
5921           first_vect = data_ref;
5922           second_vect = dr_chain[2];
5923           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5924           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5925                                            second_vect, perm3_mask_high);
5926           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5927           (*result_chain)[k] = data_ref;
5928         }
5929     }
5930   else
5931     {
5932       /* If length is not equal to 3 then only power of 2 is supported.  */
5933       gcc_assert (pow2p_hwi (length));
5934
5935       /* The encoding has a single stepped pattern.  */
5936       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5937       vec_perm_builder sel (nelt, 1, 3);
5938       sel.quick_grow (3);
5939       for (i = 0; i < 3; ++i)
5940         sel[i] = i * 2;
5941       vec_perm_indices indices (sel, 2, nelt);
5942       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5943
5944       for (i = 0; i < 3; ++i)
5945         sel[i] = i * 2 + 1;
5946       indices.new_vector (sel, 2, nelt);
5947       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5948
5949       for (i = 0; i < log_length; i++)
5950         {
5951           for (j = 0; j < length; j += 2)
5952             {
5953               first_vect = dr_chain[j];
5954               second_vect = dr_chain[j+1];
5955
5956               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5957               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5958               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5959                                                first_vect, second_vect,
5960                                                perm_mask_even);
5961               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5962               (*result_chain)[j/2] = data_ref;
5963
5964               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5965               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5966               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5967                                                first_vect, second_vect,
5968                                                perm_mask_odd);
5969               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5970               (*result_chain)[j/2+length/2] = data_ref;
5971             }
5972           memcpy (dr_chain.address (), result_chain->address (),
5973                   length * sizeof (tree));
5974         }
5975     }
5976 }
5977
5978 /* Function vect_shift_permute_load_chain.
5979
5980    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5981    sequence of stmts to reorder the input data accordingly.
5982    Return the final references for loads in RESULT_CHAIN.
5983    Return true if successed, false otherwise.
5984
5985    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5986    The input is 3 vectors each containing 8 elements.  We assign a
5987    number to each element, the input sequence is:
5988
5989    1st vec:   0  1  2  3  4  5  6  7
5990    2nd vec:   8  9 10 11 12 13 14 15
5991    3rd vec:  16 17 18 19 20 21 22 23
5992
5993    The output sequence should be:
5994
5995    1st vec:  0 3 6  9 12 15 18 21
5996    2nd vec:  1 4 7 10 13 16 19 22
5997    3rd vec:  2 5 8 11 14 17 20 23
5998
5999    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6000
6001    First we shuffle all 3 vectors to get correct elements order:
6002
6003    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
6004    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
6005    3rd vec:  (16 19 22) (17 20 23) (18 21)
6006
6007    Next we unite and shift vector 3 times:
6008
6009    1st step:
6010      shift right by 6 the concatenation of:
6011      "1st vec" and  "2nd vec"
6012        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6013      "2nd vec" and  "3rd vec"
6014        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6015      "3rd vec" and  "1st vec"
6016        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6017                              | New vectors                   |
6018
6019      So that now new vectors are:
6020
6021      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6022      2nd vec:  (10 13) (16 19 22) (17 20 23)
6023      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6024
6025    2nd step:
6026      shift right by 5 the concatenation of:
6027      "1st vec" and  "3rd vec"
6028        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6029      "2nd vec" and  "1st vec"
6030        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6031      "3rd vec" and  "2nd vec"
6032        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6033                           | New vectors                   |
6034
6035      So that now new vectors are:
6036
6037      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6038      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6039      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6040
6041    3rd step:
6042      shift right by 5 the concatenation of:
6043      "1st vec" and  "1st vec"
6044        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6045      shift right by 3 the concatenation of:
6046      "2nd vec" and  "2nd vec"
6047                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6048                           | New vectors                   |
6049
6050      So that now all vectors are READY:
6051      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6052      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6053      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6054
6055    This algorithm is faster than one in vect_permute_load_chain if:
6056      1.  "shift of a concatination" is faster than general permutation.
6057          This is usually so.
6058      2.  The TARGET machine can't execute vector instructions in parallel.
6059          This is because each step of the algorithm depends on previous.
6060          The algorithm in vect_permute_load_chain is much more parallel.
6061
6062    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6063 */
6064
6065 static bool
6066 vect_shift_permute_load_chain (vec<tree> dr_chain,
6067                                unsigned int length,
6068                                gimple *stmt,
6069                                gimple_stmt_iterator *gsi,
6070                                vec<tree> *result_chain)
6071 {
6072   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6073   tree perm2_mask1, perm2_mask2, perm3_mask;
6074   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6075   gimple *perm_stmt;
6076
6077   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6078   unsigned int i;
6079   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6080   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6081
6082   unsigned HOST_WIDE_INT nelt, vf;
6083   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6084       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6085     /* Not supported for variable-length vectors.  */
6086     return false;
6087
6088   vec_perm_builder sel (nelt, nelt, 1);
6089   sel.quick_grow (nelt);
6090
6091   result_chain->quick_grow (length);
6092   memcpy (result_chain->address (), dr_chain.address (),
6093           length * sizeof (tree));
6094
6095   if (pow2p_hwi (length) && vf > 4)
6096     {
6097       unsigned int j, log_length = exact_log2 (length);
6098       for (i = 0; i < nelt / 2; ++i)
6099         sel[i] = i * 2;
6100       for (i = 0; i < nelt / 2; ++i)
6101         sel[nelt / 2 + i] = i * 2 + 1;
6102       vec_perm_indices indices (sel, 2, nelt);
6103       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6104         {
6105           if (dump_enabled_p ())
6106             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6107                              "shuffle of 2 fields structure is not \
6108                               supported by target\n");
6109           return false;
6110         }
6111       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6112
6113       for (i = 0; i < nelt / 2; ++i)
6114         sel[i] = i * 2 + 1;
6115       for (i = 0; i < nelt / 2; ++i)
6116         sel[nelt / 2 + i] = i * 2;
6117       indices.new_vector (sel, 2, nelt);
6118       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6119         {
6120           if (dump_enabled_p ())
6121             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6122                              "shuffle of 2 fields structure is not \
6123                               supported by target\n");
6124           return false;
6125         }
6126       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6127
6128       /* Generating permutation constant to shift all elements.
6129          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6130       for (i = 0; i < nelt; i++)
6131         sel[i] = nelt / 2 + i;
6132       indices.new_vector (sel, 2, nelt);
6133       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6134         {
6135           if (dump_enabled_p ())
6136             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6137                              "shift permutation is not supported by target\n");
6138           return false;
6139         }
6140       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6141
6142       /* Generating permutation constant to select vector from 2.
6143          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6144       for (i = 0; i < nelt / 2; i++)
6145         sel[i] = i;
6146       for (i = nelt / 2; i < nelt; i++)
6147         sel[i] = nelt + i;
6148       indices.new_vector (sel, 2, nelt);
6149       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6150         {
6151           if (dump_enabled_p ())
6152             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6153                              "select is not supported by target\n");
6154           return false;
6155         }
6156       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6157
6158       for (i = 0; i < log_length; i++)
6159         {
6160           for (j = 0; j < length; j += 2)
6161             {
6162               first_vect = dr_chain[j];
6163               second_vect = dr_chain[j + 1];
6164
6165               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6166               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6167                                                first_vect, first_vect,
6168                                                perm2_mask1);
6169               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6170               vect[0] = data_ref;
6171
6172               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6173               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6174                                                second_vect, second_vect,
6175                                                perm2_mask2);
6176               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6177               vect[1] = data_ref;
6178
6179               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6180               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6181                                                vect[0], vect[1], shift1_mask);
6182               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6183               (*result_chain)[j/2 + length/2] = data_ref;
6184
6185               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6186               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6187                                                vect[0], vect[1], select_mask);
6188               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6189               (*result_chain)[j/2] = data_ref;
6190             }
6191           memcpy (dr_chain.address (), result_chain->address (),
6192                   length * sizeof (tree));
6193         }
6194       return true;
6195     }
6196   if (length == 3 && vf > 2)
6197     {
6198       unsigned int k = 0, l = 0;
6199
6200       /* Generating permutation constant to get all elements in rigth order.
6201          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6202       for (i = 0; i < nelt; i++)
6203         {
6204           if (3 * k + (l % 3) >= nelt)
6205             {
6206               k = 0;
6207               l += (3 - (nelt % 3));
6208             }
6209           sel[i] = 3 * k + (l % 3);
6210           k++;
6211         }
6212       vec_perm_indices indices (sel, 2, nelt);
6213       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6214         {
6215           if (dump_enabled_p ())
6216             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6217                              "shuffle of 3 fields structure is not \
6218                               supported by target\n");
6219           return false;
6220         }
6221       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6222
6223       /* Generating permutation constant to shift all elements.
6224          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6225       for (i = 0; i < nelt; i++)
6226         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6227       indices.new_vector (sel, 2, nelt);
6228       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6229         {
6230           if (dump_enabled_p ())
6231             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6232                              "shift permutation is not supported by target\n");
6233           return false;
6234         }
6235       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6236
6237       /* Generating permutation constant to shift all elements.
6238          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6239       for (i = 0; i < nelt; i++)
6240         sel[i] = 2 * (nelt / 3) + 1 + i;
6241       indices.new_vector (sel, 2, nelt);
6242       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6243         {
6244           if (dump_enabled_p ())
6245             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6246                              "shift permutation is not supported by target\n");
6247           return false;
6248         }
6249       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6250
6251       /* Generating permutation constant to shift all elements.
6252          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6253       for (i = 0; i < nelt; i++)
6254         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6255       indices.new_vector (sel, 2, nelt);
6256       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6257         {
6258           if (dump_enabled_p ())
6259             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6260                              "shift permutation is not supported by target\n");
6261           return false;
6262         }
6263       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6264
6265       /* Generating permutation constant to shift all elements.
6266          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6267       for (i = 0; i < nelt; i++)
6268         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6269       indices.new_vector (sel, 2, nelt);
6270       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6271         {
6272           if (dump_enabled_p ())
6273             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6274                              "shift permutation is not supported by target\n");
6275           return false;
6276         }
6277       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6278
6279       for (k = 0; k < 3; k++)
6280         {
6281           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6282           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6283                                            dr_chain[k], dr_chain[k],
6284                                            perm3_mask);
6285           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6286           vect[k] = data_ref;
6287         }
6288
6289       for (k = 0; k < 3; k++)
6290         {
6291           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6292           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6293                                            vect[k % 3], vect[(k + 1) % 3],
6294                                            shift1_mask);
6295           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6296           vect_shift[k] = data_ref;
6297         }
6298
6299       for (k = 0; k < 3; k++)
6300         {
6301           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6302           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6303                                            vect_shift[(4 - k) % 3],
6304                                            vect_shift[(3 - k) % 3],
6305                                            shift2_mask);
6306           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6307           vect[k] = data_ref;
6308         }
6309
6310       (*result_chain)[3 - (nelt % 3)] = vect[2];
6311
6312       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6313       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6314                                        vect[0], shift3_mask);
6315       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6316       (*result_chain)[nelt % 3] = data_ref;
6317
6318       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6319       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6320                                        vect[1], shift4_mask);
6321       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6322       (*result_chain)[0] = data_ref;
6323       return true;
6324     }
6325   return false;
6326 }
6327
6328 /* Function vect_transform_grouped_load.
6329
6330    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6331    to perform their permutation and ascribe the result vectorized statements to
6332    the scalar statements.
6333 */
6334
6335 void
6336 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6337                              gimple_stmt_iterator *gsi)
6338 {
6339   machine_mode mode;
6340   vec<tree> result_chain = vNULL;
6341
6342   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6343      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6344      vectors, that are ready for vector computation.  */
6345   result_chain.create (size);
6346
6347   /* If reassociation width for vector type is 2 or greater target machine can
6348      execute 2 or more vector instructions in parallel.  Otherwise try to
6349      get chain for loads group using vect_shift_permute_load_chain.  */
6350   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6351   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6352       || pow2p_hwi (size)
6353       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6354                                          gsi, &result_chain))
6355     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6356   vect_record_grouped_load_vectors (stmt, result_chain);
6357   result_chain.release ();
6358 }
6359
6360 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6361    generated as part of the vectorization of STMT.  Assign the statement
6362    for each vector to the associated scalar statement.  */
6363
6364 void
6365 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6366 {
6367   gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6368   gimple *next_stmt, *new_stmt;
6369   unsigned int i, gap_count;
6370   tree tmp_data_ref;
6371
6372   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6373      Since we scan the chain starting from it's first node, their order
6374      corresponds the order of data-refs in RESULT_CHAIN.  */
6375   next_stmt = first_stmt;
6376   gap_count = 1;
6377   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6378     {
6379       if (!next_stmt)
6380         break;
6381
6382       /* Skip the gaps.  Loads created for the gaps will be removed by dead
6383        code elimination pass later.  No need to check for the first stmt in
6384        the group, since it always exists.
6385        GROUP_GAP is the number of steps in elements from the previous
6386        access (if there is no gap GROUP_GAP is 1).  We skip loads that
6387        correspond to the gaps.  */
6388       if (next_stmt != first_stmt
6389           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
6390       {
6391         gap_count++;
6392         continue;
6393       }
6394
6395       while (next_stmt)
6396         {
6397           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6398           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6399              copies, and we put the new vector statement in the first available
6400              RELATED_STMT.  */
6401           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6402             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6403           else
6404             {
6405               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6406                 {
6407                   gimple *prev_stmt =
6408                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6409                   gimple *rel_stmt =
6410                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6411                   while (rel_stmt)
6412                     {
6413                       prev_stmt = rel_stmt;
6414                       rel_stmt =
6415                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6416                     }
6417
6418                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6419                     new_stmt;
6420                 }
6421             }
6422
6423           next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6424           gap_count = 1;
6425           /* If NEXT_STMT accesses the same DR as the previous statement,
6426              put the same TMP_DATA_REF as its vectorized statement; otherwise
6427              get the next data-ref from RESULT_CHAIN.  */
6428           if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6429             break;
6430         }
6431     }
6432 }
6433
6434 /* Function vect_force_dr_alignment_p.
6435
6436    Returns whether the alignment of a DECL can be forced to be aligned
6437    on ALIGNMENT bit boundary.  */
6438
6439 bool
6440 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6441 {
6442   if (!VAR_P (decl))
6443     return false;
6444
6445   if (decl_in_symtab_p (decl)
6446       && !symtab_node::get (decl)->can_increase_alignment_p ())
6447     return false;
6448
6449   if (TREE_STATIC (decl))
6450     return (alignment <= MAX_OFILE_ALIGNMENT);
6451   else
6452     return (alignment <= MAX_STACK_ALIGNMENT);
6453 }
6454
6455
6456 /* Return whether the data reference DR is supported with respect to its
6457    alignment.
6458    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6459    it is aligned, i.e., check if it is possible to vectorize it with different
6460    alignment.  */
6461
6462 enum dr_alignment_support
6463 vect_supportable_dr_alignment (struct data_reference *dr,
6464                                bool check_aligned_accesses)
6465 {
6466   gimple *stmt = DR_STMT (dr);
6467   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6468   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6469   machine_mode mode = TYPE_MODE (vectype);
6470   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6471   struct loop *vect_loop = NULL;
6472   bool nested_in_vect_loop = false;
6473
6474   if (aligned_access_p (dr) && !check_aligned_accesses)
6475     return dr_aligned;
6476
6477   /* For now assume all conditional loads/stores support unaligned
6478      access without any special code.  */
6479   if (is_gimple_call (stmt)
6480       && gimple_call_internal_p (stmt)
6481       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6482           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6483     return dr_unaligned_supported;
6484
6485   if (loop_vinfo)
6486     {
6487       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6488       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6489     }
6490
6491   /* Possibly unaligned access.  */
6492
6493   /* We can choose between using the implicit realignment scheme (generating
6494      a misaligned_move stmt) and the explicit realignment scheme (generating
6495      aligned loads with a REALIGN_LOAD).  There are two variants to the
6496      explicit realignment scheme: optimized, and unoptimized.
6497      We can optimize the realignment only if the step between consecutive
6498      vector loads is equal to the vector size.  Since the vector memory
6499      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6500      is guaranteed that the misalignment amount remains the same throughout the
6501      execution of the vectorized loop.  Therefore, we can create the
6502      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6503      at the loop preheader.
6504
6505      However, in the case of outer-loop vectorization, when vectorizing a
6506      memory access in the inner-loop nested within the LOOP that is now being
6507      vectorized, while it is guaranteed that the misalignment of the
6508      vectorized memory access will remain the same in different outer-loop
6509      iterations, it is *not* guaranteed that is will remain the same throughout
6510      the execution of the inner-loop.  This is because the inner-loop advances
6511      with the original scalar step (and not in steps of VS).  If the inner-loop
6512      step happens to be a multiple of VS, then the misalignment remains fixed
6513      and we can use the optimized realignment scheme.  For example:
6514
6515       for (i=0; i<N; i++)
6516         for (j=0; j<M; j++)
6517           s += a[i+j];
6518
6519      When vectorizing the i-loop in the above example, the step between
6520      consecutive vector loads is 1, and so the misalignment does not remain
6521      fixed across the execution of the inner-loop, and the realignment cannot
6522      be optimized (as illustrated in the following pseudo vectorized loop):
6523
6524       for (i=0; i<N; i+=4)
6525         for (j=0; j<M; j++){
6526           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6527                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6528                          // (assuming that we start from an aligned address).
6529           }
6530
6531      We therefore have to use the unoptimized realignment scheme:
6532
6533       for (i=0; i<N; i+=4)
6534           for (j=k; j<M; j+=4)
6535           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6536                            // that the misalignment of the initial address is
6537                            // 0).
6538
6539      The loop can then be vectorized as follows:
6540
6541       for (k=0; k<4; k++){
6542         rt = get_realignment_token (&vp[k]);
6543         for (i=0; i<N; i+=4){
6544           v1 = vp[i+k];
6545           for (j=k; j<M; j+=4){
6546             v2 = vp[i+j+VS-1];
6547             va = REALIGN_LOAD <v1,v2,rt>;
6548             vs += va;
6549             v1 = v2;
6550           }
6551         }
6552     } */
6553
6554   if (DR_IS_READ (dr))
6555     {
6556       bool is_packed = false;
6557       tree type = (TREE_TYPE (DR_REF (dr)));
6558
6559       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6560           && (!targetm.vectorize.builtin_mask_for_load
6561               || targetm.vectorize.builtin_mask_for_load ()))
6562         {
6563           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6564
6565           /* If we are doing SLP then the accesses need not have the
6566              same alignment, instead it depends on the SLP group size.  */
6567           if (loop_vinfo
6568               && STMT_SLP_TYPE (stmt_info)
6569               && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6570                               * GROUP_SIZE (vinfo_for_stmt
6571                                             (GROUP_FIRST_ELEMENT (stmt_info))),
6572                               TYPE_VECTOR_SUBPARTS (vectype)))
6573             ;
6574           else if (!loop_vinfo
6575                    || (nested_in_vect_loop
6576                        && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6577                                     GET_MODE_SIZE (TYPE_MODE (vectype)))))
6578             return dr_explicit_realign;
6579           else
6580             return dr_explicit_realign_optimized;
6581         }
6582       if (!known_alignment_for_access_p (dr))
6583         is_packed = not_size_aligned (DR_REF (dr));
6584
6585       if (targetm.vectorize.support_vector_misalignment
6586             (mode, type, DR_MISALIGNMENT (dr), is_packed))
6587         /* Can't software pipeline the loads, but can at least do them.  */
6588         return dr_unaligned_supported;
6589     }
6590   else
6591     {
6592       bool is_packed = false;
6593       tree type = (TREE_TYPE (DR_REF (dr)));
6594
6595       if (!known_alignment_for_access_p (dr))
6596         is_packed = not_size_aligned (DR_REF (dr));
6597
6598      if (targetm.vectorize.support_vector_misalignment
6599            (mode, type, DR_MISALIGNMENT (dr), is_packed))
6600        return dr_unaligned_supported;
6601     }
6602
6603   /* Unsupported.  */
6604   return dr_unaligned_unsupported;
6605 }