gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
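/* Illustrative sketch (for exposition only; the loop and the names are
   assumptions, not part of this file): a two-lane grouped access like

     for (int i = 0; i < n; i++)
       {
         x[i] = a[2 * i];        (lane 0)
         y[i] = a[2 * i + 1];    (lane 1)
       }

   would be a candidate for a load-lanes instruction; here COUNT == 2
   and the optab is queried for an array mode that holds two vectors
   of the access type.  */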
102 /* Return the smallest scalar part of STMT_INFO.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation, we would miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
119 tree
120 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
121 HOST_WIDE_INT *lhs_size_unit,
122 HOST_WIDE_INT *rhs_size_unit)
124 tree scalar_type = gimple_expr_type (stmt_info->stmt);
125 HOST_WIDE_INT lhs, rhs;
127 /* During the analysis phase, this function is called on arbitrary
128 statements that might not have scalar results. */
129 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
130 return scalar_type;
132 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
134 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
135 if (assign
136 && (gimple_assign_cast_p (assign)
137 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
140 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
141 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
143 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
145 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
146 if (rhs < lhs)
147 scalar_type = rhs_type;
150 *lhs_size_unit = lhs;
151 *rhs_size_unit = rhs;
152 return scalar_type;
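/* Illustrative example (assumed for exposition, not part of this file):
   in a loop like

     short *s; int *d;
     for (i = 0; i < n; i++)
       d[i] = (int) s[i] * 2;

   the lhs type is int but the smallest scalar type is short, and it is
   the short elements that must determine the vectorization factor; this
   is why the rhs of promotions is inspected above.  */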
156 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
157 tested at run-time. Return TRUE if DDR was successfully inserted.
158 Return false if versioning is not supported. */
160 static bool
161 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
163 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
165 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
166 return false;
168 if (!runtime_alias_check_p (ddr, loop,
169 optimize_loop_nest_for_speed_p (loop)))
170 return false;
172 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
173 return true;
176 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
178 static void
179 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
181 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
182 for (unsigned int i = 0; i < checks.length(); ++i)
183 if (checks[i] == value)
184 return;
186 if (dump_enabled_p ())
188 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
189 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
190 dump_printf (MSG_NOTE, " is nonzero\n");
192 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
195 /* Return true if we know that the order of vectorized DR_INFO_A and
196 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
197 DR_INFO_B. At least one of the accesses is a write. */
199 static bool
200 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
202 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
203 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
205 /* Single statements are always kept in their original order. */
206 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
207 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
208 return true;
210 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
211 group are emitted at the position of the first scalar load and all
212 stores in a group are emitted at the position of the last scalar store.
213 Thus writes will happen no earlier than their current position
214 (but could happen later) while reads will happen no later than their
215 current position (but could happen earlier). Reordering is therefore
216 only possible if the first access is a write. */
217 stmtinfo_a = vect_orig_stmt (stmtinfo_a);
218 stmtinfo_b = vect_orig_stmt (stmtinfo_b);
219 stmt_vec_info earlier_stmt_info = get_earlier_stmt (stmtinfo_a, stmtinfo_b);
220 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (earlier_stmt_info));
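/* Illustrative example (assumed): for a pair of grouped accesses

     .. = a[i];   (load group, emitted at the position of the first load)
     a[i] = ..;   (store group, emitted at the position of the last store)

   loads can only move earlier and stores can only move later, so the
   scalar read-before-write order is preserved.  If the earlier access
   of the pair were the write, vectorization could reorder them, which
   is what the check above rejects.  */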
223 /* A subroutine of vect_analyze_data_ref_dependence. Handle
224 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
225 distances. These distances are conservatively correct but they don't
226 reflect a guaranteed dependence.
228 Return true if this function does all the work necessary to avoid
229 an alias or false if the caller should use the dependence distances
230 to limit the vectorization factor in the usual way. LOOP_DEPTH is
231 the depth of the loop described by LOOP_VINFO and the other arguments
232 are as for vect_analyze_data_ref_dependence. */
234 static bool
235 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
236 loop_vec_info loop_vinfo,
237 int loop_depth, unsigned int *max_vf)
239 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
240 lambda_vector dist_v;
241 unsigned int i;
242 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
244 int dist = dist_v[loop_depth];
245 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
247 /* If the user asserted safelen >= DIST consecutive iterations
248 can be executed concurrently, assume independence.
250 ??? An alternative would be to add the alias check even
251 in this case, and vectorize the fallback loop with the
252 maximum VF set to safelen. However, if the user has
253 explicitly given a length, it's less likely that that
254 would be a win. */
255 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
257 if ((unsigned int) loop->safelen < *max_vf)
258 *max_vf = loop->safelen;
259 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
260 continue;
263 /* For dependence distances of 2 or more, we have the option
264 of limiting VF or checking for an alias at runtime.
265 Prefer to check at runtime if we can, to avoid limiting
266 the VF unnecessarily when the bases are in fact independent.
268 Note that the alias checks will be removed if the VF ends up
269 being small enough. */
270 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
271 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
272 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
273 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
274 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
277 return true;
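/* Usage note (illustrative; the directive and loop are assumptions):
   the safelen guarantee used by these dependence checks typically comes
   from a user annotation such as

     #pragma omp simd safelen(8)
     for (i = 0; i < n; i++)
       a[i] = a[i + k] + 1;

   which promises that up to 8 consecutive iterations may execute
   concurrently; rather than giving up on a dependence it cannot prove
   harmless, the vectorizer then merely caps *max_vf at 8.  */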
281 /* Function vect_analyze_data_ref_dependence.
283 Return TRUE if there (might) exist a dependence between a memory-reference
284 DRA and a memory-reference DRB. When versioning for alias can check the
285 dependence at run time, return FALSE instead. Adjust *MAX_VF according to
286 the data dependence. */
288 static bool
289 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
290 loop_vec_info loop_vinfo,
291 unsigned int *max_vf)
293 unsigned int i;
294 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
295 struct data_reference *dra = DDR_A (ddr);
296 struct data_reference *drb = DDR_B (ddr);
297 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
298 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
299 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
300 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
301 lambda_vector dist_v;
302 unsigned int loop_depth;
304 /* In loop analysis all data references should be vectorizable. */
305 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
306 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
307 gcc_unreachable ();
309 /* Independent data accesses. */
310 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
311 return false;
313 if (dra == drb
314 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
315 return false;
317 /* We do not have to consider dependences between accesses that belong
318 to the same group, unless the stride could be smaller than the
319 group size. */
320 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
321 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
322 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
323 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
324 return false;
326 /* Even if we have an anti-dependence then, as the vectorized loop covers at
327 least two scalar iterations, there is always also a true dependence.
328 As the vectorizer does not re-order loads and stores we can ignore
329 the anti-dependence if TBAA can disambiguate both DRs similar to the
330 case with known negative distance anti-dependences (positive
331 distance anti-dependences would violate TBAA constraints). */
332 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
333 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
334 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
335 get_alias_set (DR_REF (drb))))
336 return false;
338 /* Unknown data dependence. */
339 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
341 /* If user asserted safelen consecutive iterations can be
342 executed concurrently, assume independence. */
343 if (loop->safelen >= 2)
345 if ((unsigned int) loop->safelen < *max_vf)
346 *max_vf = loop->safelen;
347 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
348 return false;
351 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
352 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
354 if (dump_enabled_p ())
356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
357 "versioning for alias not supported for: "
358 "can't determine dependence between ");
359 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
360 DR_REF (dra));
361 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
362 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
363 DR_REF (drb));
364 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
366 return true;
369 if (dump_enabled_p ())
371 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
372 "versioning for alias required: "
373 "can't determine dependence between ");
374 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
375 DR_REF (dra));
376 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
377 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
378 DR_REF (drb));
379 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
382 /* Add to list of ddrs that need to be tested at run-time. */
383 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
386 /* Known data dependence. */
387 if (DDR_NUM_DIST_VECTS (ddr) == 0)
389 /* If user asserted safelen consecutive iterations can be
390 executed concurrently, assume independence. */
391 if (loop->safelen >= 2)
393 if ((unsigned int) loop->safelen < *max_vf)
394 *max_vf = loop->safelen;
395 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
396 return false;
399 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
400 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
402 if (dump_enabled_p ())
404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
405 "versioning for alias not supported for: "
406 "bad dist vector for ");
407 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
408 DR_REF (dra));
409 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
410 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
411 DR_REF (drb));
412 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
414 return true;
417 if (dump_enabled_p ())
419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
420 "versioning for alias required: "
421 "bad dist vector for ");
422 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
423 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
424 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
425 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
427 /* Add to list of ddrs that need to be tested at run-time. */
428 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
431 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
433 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
434 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
435 loop_depth, max_vf))
436 return false;
438 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
440 int dist = dist_v[loop_depth];
442 if (dump_enabled_p ())
443 dump_printf_loc (MSG_NOTE, vect_location,
444 "dependence distance = %d.\n", dist);
446 if (dist == 0)
448 if (dump_enabled_p ())
450 dump_printf_loc (MSG_NOTE, vect_location,
451 "dependence distance == 0 between ");
452 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
453 dump_printf (MSG_NOTE, " and ");
454 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
455 dump_printf (MSG_NOTE, "\n");
458 /* When we perform grouped accesses and perform implicit CSE
459 by detecting equal accesses and doing disambiguation with
460 runtime alias tests like for
461 .. = a[i];
462 .. = a[i+1];
463 a[i] = ..;
464 a[i+1] = ..;
465 *p = ..;
466 .. = a[i];
467 .. = a[i+1];
468 where we will end up loading { a[i], a[i+1] } once, make
469 sure that inserting group loads before the first load and
470 stores after the last store will do the right thing.
471 Similar for groups like
472 a[i] = ...;
473 ... = a[i];
474 a[i+1] = ...;
475 where loads from the group interleave with the store. */
476 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
478 if (dump_enabled_p ())
479 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
480 "READ_WRITE dependence in interleaving.\n");
481 return true;
484 if (loop->safelen < 2)
486 tree indicator = dr_zero_step_indicator (dra);
487 if (!indicator || integer_zerop (indicator))
489 if (dump_enabled_p ())
490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
491 "access also has a zero step\n");
492 return true;
494 else if (TREE_CODE (indicator) != INTEGER_CST)
495 vect_check_nonzero_value (loop_vinfo, indicator);
497 continue;
500 if (dist > 0 && DDR_REVERSED_P (ddr))
502 /* If DDR_REVERSED_P the order of the data-refs in DDR was
503 reversed (to make distance vector positive), and the actual
504 distance is negative. */
505 if (dump_enabled_p ())
506 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
507 "dependence distance negative.\n");
508 /* Record a negative dependence distance to later limit the
509 amount of stmt copying / unrolling we can perform.
510 Only need to handle read-after-write dependence. */
511 if (DR_IS_READ (drb)
512 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
513 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
514 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
515 continue;
518 unsigned int abs_dist = abs (dist);
519 if (abs_dist >= 2 && abs_dist < *max_vf)
521 /* The dependence distance requires reduction of the maximal
522 vectorization factor. */
523 *max_vf = abs (dist);
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location,
526 "adjusting maximal vectorization factor to %i\n",
527 *max_vf);
530 if (abs_dist >= *max_vf)
532 /* Dependence distance does not create dependence, as far as
533 vectorization is concerned, in this case. */
534 if (dump_enabled_p ())
535 dump_printf_loc (MSG_NOTE, vect_location,
536 "dependence distance >= VF.\n");
537 continue;
540 if (dump_enabled_p ())
542 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
543 "not vectorized, possible dependence "
544 "between data-refs ");
545 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
546 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
547 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
548 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
551 return true;
554 return false;
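/* Worked example (illustrative): in

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   the dependence distance is 3, so the loop above reduces *max_vf to 3;
   with at most three elements processed per vector iteration every a[i]
   is still read before the corresponding a[i + 3] is written.  Distances
   greater than or equal to the VF are ignored, and unprovable dependences
   fall back to the run-time alias checks recorded earlier.  */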
557 /* Function vect_analyze_data_ref_dependences.
559 Examine all the data references in the loop, and make sure there do not
560 exist any data dependences between them. Set *MAX_VF according to
561 the maximum vectorization factor the data dependences allow. */
563 bool
564 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
565 unsigned int *max_vf)
567 unsigned int i;
568 struct data_dependence_relation *ddr;
570 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
572 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
574 LOOP_VINFO_DDRS (loop_vinfo)
575 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
576 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
577 /* We need read-read dependences to compute
578 STMT_VINFO_SAME_ALIGN_REFS. */
579 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
580 &LOOP_VINFO_DDRS (loop_vinfo),
581 LOOP_VINFO_LOOP_NEST (loop_vinfo),
582 true);
583 gcc_assert (res);
586 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
588 /* For epilogues we either have no aliases or alias versioning
589 was applied to the original loop. Therefore we may just get max_vf
590 using the VF of the original loop. */
591 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
592 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
593 else
594 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
595 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
596 return false;
598 return true;
602 /* Function vect_slp_analyze_data_ref_dependence.
604 Return TRUE if there (might) exist a dependence between a memory-reference
605 DRA and a memory-reference DRB for VINFO. When versioning for alias
606 can check the dependence at run time, return FALSE. */
609 static bool
610 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
611 struct data_dependence_relation *ddr)
613 struct data_reference *dra = DDR_A (ddr);
614 struct data_reference *drb = DDR_B (ddr);
615 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
616 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
618 /* We need to check dependences of statements marked as unvectorizable
619 as well, they still can prohibit vectorization. */
621 /* Independent data accesses. */
622 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
623 return false;
625 if (dra == drb)
626 return false;
628 /* Read-read is OK. */
629 if (DR_IS_READ (dra) && DR_IS_READ (drb))
630 return false;
632 /* If dra and drb are part of the same interleaving chain consider
633 them independent. */
634 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
635 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
636 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
637 return false;
639 /* Unknown data dependence. */
640 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
642 if (dump_enabled_p ())
644 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
645 "can't determine dependence between ");
646 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
647 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
648 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
649 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
652 else if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location,
655 "determined dependence between ");
656 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
657 dump_printf (MSG_NOTE, " and ");
658 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
659 dump_printf (MSG_NOTE, "\n");
662 return true;
666 /* Analyze dependences involved in the transform of SLP NODE. STORES
667 contain the vector of scalar stores of this instance if we are
668 disambiguating the loads. */
670 static bool
671 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
672 vec<stmt_vec_info> stores,
673 stmt_vec_info last_store_info)
675 /* This walks over all stmts involved in the SLP load/store done
676 in NODE verifying we can sink them up to the last stmt in the
677 group. */
678 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
679 vec_info *vinfo = last_access_info->vinfo;
680 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
682 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
683 if (access_info == last_access_info)
684 continue;
685 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
686 ao_ref ref;
687 bool ref_initialized_p = false;
688 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
689 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
691 gimple *stmt = gsi_stmt (gsi);
692 if (! gimple_vuse (stmt)
693 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
694 continue;
696 /* If we couldn't record a (single) data reference for this
697 stmt we have to resort to the alias oracle. */
698 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
699 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
700 if (!dr_b)
702 /* We are moving a store or sinking a load - this means
703 we cannot use TBAA for disambiguation. */
704 if (!ref_initialized_p)
705 ao_ref_init (&ref, DR_REF (dr_a));
706 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
707 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
708 return false;
709 continue;
712 bool dependent = false;
713 /* If we run into a store of this same instance (we've just
714 marked those) then delay dependence checking until we run
715 into the last store because this is where it will have
716 been sunk to (and we verify if we can do that as well). */
717 if (gimple_visited_p (stmt))
719 if (stmt_info != last_store_info)
720 continue;
721 unsigned i;
722 stmt_vec_info store_info;
723 FOR_EACH_VEC_ELT (stores, i, store_info)
725 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
726 ddr_p ddr = initialize_data_dependence_relation
727 (dr_a, store_dr, vNULL);
728 dependent
729 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
730 free_dependence_relation (ddr);
731 if (dependent)
732 break;
735 else
737 ddr_p ddr = initialize_data_dependence_relation (dr_a,
738 dr_b, vNULL);
739 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
740 free_dependence_relation (ddr);
742 if (dependent)
743 return false;
746 return true;
750 /* Function vect_slp_analyze_instance_dependence.
752 Examine all the data references in the SLP instance, and make sure there
753 do not exist any data dependences between them that would prevent the
754 instance from being vectorized. */
756 bool
757 vect_slp_analyze_instance_dependence (slp_instance instance)
759 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
761 /* The stores of this instance are at the root of the SLP tree. */
762 slp_tree store = SLP_INSTANCE_TREE (instance);
763 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
764 store = NULL;
766 /* Verify we can sink stores to the vectorized stmt insert location. */
767 stmt_vec_info last_store_info = NULL;
768 if (store)
770 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
771 return false;
773 /* Mark stores in this instance and remember the last one. */
774 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
775 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
776 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
779 bool res = true;
781 /* Verify we can sink loads to the vectorized stmt insert location,
782 special-casing stores of this instance. */
783 slp_tree load;
784 unsigned int i;
785 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
786 if (! vect_slp_analyze_node_dependences (instance, load,
787 store
788 ? SLP_TREE_SCALAR_STMTS (store)
789 : vNULL, last_store_info))
791 res = false;
792 break;
795 /* Unset the visited flag. */
796 if (store)
797 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
798 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
800 return res;
803 /* Record the base alignment guarantee given by DRB, which occurs
804 in STMT_INFO. */
806 static void
807 vect_record_base_alignment (stmt_vec_info stmt_info,
808 innermost_loop_behavior *drb)
810 vec_info *vinfo = stmt_info->vinfo;
811 bool existed;
812 innermost_loop_behavior *&entry
813 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
814 if (!existed || entry->base_alignment < drb->base_alignment)
816 entry = drb;
817 if (dump_enabled_p ())
819 dump_printf_loc (MSG_NOTE, vect_location,
820 "recording new base alignment for ");
821 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
822 dump_printf (MSG_NOTE, "\n");
823 dump_printf_loc (MSG_NOTE, vect_location,
824 " alignment: %d\n", drb->base_alignment);
825 dump_printf_loc (MSG_NOTE, vect_location,
826 " misalignment: %d\n", drb->base_misalignment);
827 dump_printf_loc (MSG_NOTE, vect_location,
828 " based on: ");
829 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
834 /* If the region we're going to vectorize is reached, all unconditional
835 data references occur at least once. We can therefore pool the base
836 alignment guarantees from each unconditional reference. Do this by
837 going through all the data references in VINFO and checking whether
838 the containing statement makes the reference unconditionally. If so,
839 record the alignment of the base address in VINFO so that it can be
840 used for all other references with the same base. */
842 void
843 vect_record_base_alignments (vec_info *vinfo)
845 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
846 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
847 data_reference *dr;
848 unsigned int i;
849 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
851 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
852 stmt_vec_info stmt_info = dr_info->stmt;
853 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
854 && STMT_VINFO_VECTORIZABLE (stmt_info)
855 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
857 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
859 /* If DR is nested in the loop that is being vectorized, we can also
860 record the alignment of the base wrt the outer loop. */
861 if (loop && nested_in_vect_loop_p (loop, stmt_info))
862 vect_record_base_alignment
863 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
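/* Illustrative example (assumed): if the region contains

     p[i] = 0;                   (unconditional store through base p)
     if (c[i]) t = p[i + 1];     (conditional load from the same base)

   the unconditional reference is executed whenever the region is
   reached, so its base-alignment guarantee is pooled and can also be
   used for the conditional reference that shares the base address.  */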
868 /* Return the target alignment for the vectorized form of DR_INFO. */
870 static unsigned int
871 vect_calculate_target_alignment (dr_vec_info *dr_info)
873 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
874 return targetm.vectorize.preferred_vector_alignment (vectype);
877 /* Function vect_compute_data_ref_alignment
879 Compute the misalignment of the data reference DR_INFO.
881 Output:
882 1. DR_MISALIGNMENT (DR_INFO) is defined.
884 FOR NOW: No analysis is actually performed. Misalignment is calculated
885 only for trivial cases. TODO. */
887 static void
888 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
890 stmt_vec_info stmt_info = dr_info->stmt;
891 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
892 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
893 struct loop *loop = NULL;
894 tree ref = DR_REF (dr_info->dr);
895 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
897 if (dump_enabled_p ())
898 dump_printf_loc (MSG_NOTE, vect_location,
899 "vect_compute_data_ref_alignment:\n");
901 if (loop_vinfo)
902 loop = LOOP_VINFO_LOOP (loop_vinfo);
904 /* Initialize misalignment to unknown. */
905 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
907 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
908 return;
910 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
911 bool step_preserves_misalignment_p;
913 unsigned HOST_WIDE_INT vector_alignment
914 = vect_calculate_target_alignment (dr_info) / BITS_PER_UNIT;
915 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
917 /* No step for BB vectorization. */
918 if (!loop)
920 gcc_assert (integer_zerop (drb->step));
921 step_preserves_misalignment_p = true;
924 /* In case the dataref is in an inner-loop of the loop that is being
925 vectorized (LOOP), we use the base and misalignment information
926 relative to the outer-loop (LOOP). This is ok only if the misalignment
927 stays the same throughout the execution of the inner-loop, which is why
928 we have to check that the stride of the dataref in the inner-loop evenly
929 divides by the vector alignment. */
930 else if (nested_in_vect_loop_p (loop, stmt_info))
932 step_preserves_misalignment_p
933 = (DR_STEP_ALIGNMENT (dr_info->dr) % vector_alignment) == 0;
935 if (dump_enabled_p ())
937 if (step_preserves_misalignment_p)
938 dump_printf_loc (MSG_NOTE, vect_location,
939 "inner step divides the vector alignment.\n");
940 else
941 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
942 "inner step doesn't divide the vector"
943 " alignment.\n");
947 /* Similarly we can only use base and misalignment information relative to
948 an innermost loop if the misalignment stays the same throughout the
949 execution of the loop. As above, this is the case if the stride of
950 the dataref evenly divides by the alignment. */
951 else
953 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
954 step_preserves_misalignment_p
955 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vector_alignment);
957 if (!step_preserves_misalignment_p && dump_enabled_p ())
958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
959 "step doesn't divide the vector alignment.\n");
962 unsigned int base_alignment = drb->base_alignment;
963 unsigned int base_misalignment = drb->base_misalignment;
965 /* Calculate the maximum of the pooled base address alignment and the
966 alignment that we can compute for DR itself. */
967 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
968 if (entry && base_alignment < (*entry)->base_alignment)
970 base_alignment = (*entry)->base_alignment;
971 base_misalignment = (*entry)->base_misalignment;
974 if (drb->offset_alignment < vector_alignment
975 || !step_preserves_misalignment_p
976 /* We need to know whether the step wrt the vectorized loop is
977 negative when computing the starting misalignment below. */
978 || TREE_CODE (drb->step) != INTEGER_CST)
980 if (dump_enabled_p ())
982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
983 "Unknown alignment for access: ");
984 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
985 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
987 return;
990 if (base_alignment < vector_alignment)
992 unsigned int max_alignment;
993 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
994 if (max_alignment < vector_alignment
995 || !vect_can_force_dr_alignment_p (base,
996 vector_alignment * BITS_PER_UNIT))
998 if (dump_enabled_p ())
1000 dump_printf_loc (MSG_NOTE, vect_location,
1001 "can't force alignment of ref: ");
1002 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1003 dump_printf (MSG_NOTE, "\n");
1005 return;
1008 /* Force the alignment of the decl.
1009 NOTE: This is the only change to the code we make during
1010 the analysis phase, before deciding to vectorize the loop. */
1011 if (dump_enabled_p ())
1013 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
1014 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1015 dump_printf (MSG_NOTE, "\n");
1018 dr_info->base_decl = base;
1019 dr_info->base_misaligned = true;
1020 base_misalignment = 0;
1022 poly_int64 misalignment
1023 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1025 /* If this is a backward running DR then first access in the larger
1026 vectype actually is N-1 elements before the address in the DR.
1027 Adjust misalign accordingly. */
1028 if (tree_int_cst_sgn (drb->step) < 0)
1029 /* PLUS because STEP is negative. */
1030 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1031 * TREE_INT_CST_LOW (drb->step));
1033 unsigned int const_misalignment;
1034 if (!known_misalignment (misalignment, vector_alignment,
1035 &const_misalignment))
1037 if (dump_enabled_p ())
1039 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1040 "Non-constant misalignment for access: ");
1041 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1042 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1044 return;
1047 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1049 if (dump_enabled_p ())
1051 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1052 "misalign = %d bytes of ref ",
1053 DR_MISALIGNMENT (dr_info));
1054 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1055 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1058 return;
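/* Worked example (illustrative, with assumed numbers): for a reference
   a[i] with 4-byte elements, a 16-byte preferred vector alignment, a
   16-byte-aligned base and a DR_INIT of 12 bytes, the computed
   misalignment is 12.  With a backward step of -4 and 4 elements per
   vector, the adjustment above adds (4 - 1) * -4 = -12, giving a
   misalignment of 0 for the first vector access.  */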
1061 /* Function vect_update_misalignment_for_peel.
1062 Sets DR_INFO's misalignment
1063 - to 0 if it has the same alignment as DR_PEEL_INFO,
1064 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1065 - to -1 (unknown) otherwise.
1067 DR_INFO - the data reference whose misalignment is to be adjusted.
1068 DR_PEEL_INFO - the data reference whose misalignment is being made
1069 zero in the vector loop by the peel.
1070 NPEEL - the number of iterations in the peel loop if the misalignment
1071 of DR_PEEL_INFO is known at compile time. */
1073 static void
1074 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1075 dr_vec_info *dr_peel_info, int npeel)
1077 unsigned int i;
1078 vec<dr_p> same_aligned_drs;
1079 struct data_reference *current_dr;
1080 int dr_size = vect_get_scalar_dr_size (dr_info);
1081 int dr_peel_size = vect_get_scalar_dr_size (dr_peel_info);
1082 stmt_vec_info stmt_info = dr_info->stmt;
1083 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1085 /* For interleaved data accesses the step in the loop must be multiplied by
1086 the size of the interleaving group. */
1087 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1088 dr_size *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
1089 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1090 dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1092 /* It can be assumed that the data refs with the same alignment as dr_peel
1093 are aligned in the vector loop. */
1094 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1095 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1097 if (current_dr != dr_info->dr)
1098 continue;
1099 gcc_assert (!known_alignment_for_access_p (dr_info)
1100 || !known_alignment_for_access_p (dr_peel_info)
1101 || (DR_MISALIGNMENT (dr_info) / dr_size
1102 == DR_MISALIGNMENT (dr_peel_info) / dr_peel_size));
1103 SET_DR_MISALIGNMENT (dr_info, 0);
1104 return;
1107 if (known_alignment_for_access_p (dr_info)
1108 && known_alignment_for_access_p (dr_peel_info))
1110 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1111 size_zero_node) < 0;
1112 int misal = DR_MISALIGNMENT (dr_info);
1113 misal += negative ? -npeel * dr_size : npeel * dr_size;
1114 misal &= DR_TARGET_ALIGNMENT (dr_info) - 1;
1115 SET_DR_MISALIGNMENT (dr_info, misal);
1116 return;
1119 if (dump_enabled_p ())
1120 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1121 "to unknown (-1).\n");
1122 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
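/* Worked example (illustrative): for a reference with a known
   misalignment of 12 bytes, 4-byte elements, a positive step and a
   16-byte target alignment, peeling NPEEL = 3 iterations updates the
   misalignment to (12 + 3 * 4) & (16 - 1) == 8.  */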
1126 /* Function verify_data_ref_alignment
1128 Return TRUE if DR_INFO can be handled with respect to alignment. */
1130 static bool
1131 verify_data_ref_alignment (dr_vec_info *dr_info)
1133 enum dr_alignment_support supportable_dr_alignment
1134 = vect_supportable_dr_alignment (dr_info, false);
1135 if (!supportable_dr_alignment)
1137 if (dump_enabled_p ())
1139 if (DR_IS_READ (dr_info->dr))
1140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1141 "not vectorized: unsupported unaligned load.");
1142 else
1143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1144 "not vectorized: unsupported unaligned "
1145 "store.");
1147 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1148 DR_REF (dr_info->dr));
1149 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1151 return false;
1154 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1155 dump_printf_loc (MSG_NOTE, vect_location,
1156 "Vectorizing an unaligned access.\n");
1158 return true;
1161 /* Function vect_verify_datarefs_alignment
1163 Return TRUE if all data references in the loop can be
1164 handled with respect to alignment. */
1166 bool
1167 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1169 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1170 struct data_reference *dr;
1171 unsigned int i;
1173 FOR_EACH_VEC_ELT (datarefs, i, dr)
1175 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1176 stmt_vec_info stmt_info = dr_info->stmt;
1178 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1179 continue;
1181 /* For interleaving, only the alignment of the first access matters. */
1182 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1183 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1184 continue;
1186 /* Strided accesses perform only component accesses, alignment is
1187 irrelevant for them. */
1188 if (STMT_VINFO_STRIDED_P (stmt_info)
1189 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1190 continue;
1192 if (! verify_data_ref_alignment (dr_info))
1193 return false;
1196 return true;
1199 /* Given a memory reference EXP return whether its alignment is less
1200 than its size. */
1202 static bool
1203 not_size_aligned (tree exp)
1205 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1206 return true;
1208 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1209 > get_object_alignment (exp));
1212 /* Function vector_alignment_reachable_p
1214 Return true if vector alignment for DR_INFO is reachable by peeling
1215 a few loop iterations. Return false otherwise. */
1217 static bool
1218 vector_alignment_reachable_p (dr_vec_info *dr_info)
1220 stmt_vec_info stmt_info = dr_info->stmt;
1221 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1223 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1225 /* For interleaved access we peel only if number of iterations in
1226 the prolog loop ({VF - misalignment}), is a multiple of the
1227 number of the interleaved accesses. */
1228 int elem_size, mis_in_elements;
1230 /* FORNOW: handle only known alignment. */
1231 if (!known_alignment_for_access_p (dr_info))
1232 return false;
1234 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1235 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1236 elem_size = vector_element_size (vector_size, nelements);
1237 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1239 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1240 return false;
1243 /* If misalignment is known at the compile time then allow peeling
1244 only if natural alignment is reachable through peeling. */
1245 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1247 HOST_WIDE_INT elmsize =
1248 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1249 if (dump_enabled_p ())
1251 dump_printf_loc (MSG_NOTE, vect_location,
1252 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1253 dump_printf (MSG_NOTE,
1254 ". misalignment = %d.\n", DR_MISALIGNMENT (dr_info));
1256 if (DR_MISALIGNMENT (dr_info) % elmsize)
1258 if (dump_enabled_p ())
1259 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1260 "data size does not divide the misalignment.\n");
1261 return false;
1265 if (!known_alignment_for_access_p (dr_info))
1267 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1268 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1269 if (dump_enabled_p ())
1270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1271 "Unknown misalignment, %snaturally aligned\n",
1272 is_packed ? "not " : "");
1273 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1276 return true;
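/* Illustrative example (assumed): a reference with 4-byte elements that
   is misaligned by 6 bytes can never reach vector alignment by peeling
   whole scalar iterations, since each peeled iteration changes the
   misalignment by a multiple of 4; this is why misalignments that the
   element size does not divide are rejected in the function above.  */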
1280 /* Calculate the cost of the memory access represented by DR_INFO. */
1282 static void
1283 vect_get_data_access_cost (dr_vec_info *dr_info,
1284 unsigned int *inside_cost,
1285 unsigned int *outside_cost,
1286 stmt_vector_for_cost *body_cost_vec,
1287 stmt_vector_for_cost *prologue_cost_vec)
1289 stmt_vec_info stmt_info = dr_info->stmt;
1290 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1291 int ncopies;
1293 if (PURE_SLP_STMT (stmt_info))
1294 ncopies = 1;
1295 else
1296 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1298 if (DR_IS_READ (dr_info->dr))
1299 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1300 prologue_cost_vec, body_cost_vec, false);
1301 else
1302 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1304 if (dump_enabled_p ())
1305 dump_printf_loc (MSG_NOTE, vect_location,
1306 "vect_get_data_access_cost: inside_cost = %d, "
1307 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1311 typedef struct _vect_peel_info
1313 dr_vec_info *dr_info;
1314 int npeel;
1315 unsigned int count;
1316 } *vect_peel_info;
1318 typedef struct _vect_peel_extended_info
1320 struct _vect_peel_info peel_info;
1321 unsigned int inside_cost;
1322 unsigned int outside_cost;
1323 } *vect_peel_extended_info;
1326 /* Peeling hashtable helpers. */
1328 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1330 static inline hashval_t hash (const _vect_peel_info *);
1331 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1334 inline hashval_t
1335 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1337 return (hashval_t) peel_info->npeel;
1340 inline bool
1341 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1343 return (a->npeel == b->npeel);
1347 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1349 static void
1350 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1351 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1352 int npeel)
1354 struct _vect_peel_info elem, *slot;
1355 _vect_peel_info **new_slot;
1356 bool supportable_dr_alignment
1357 = vect_supportable_dr_alignment (dr_info, true);
1359 elem.npeel = npeel;
1360 slot = peeling_htab->find (&elem);
1361 if (slot)
1362 slot->count++;
1363 else
1365 slot = XNEW (struct _vect_peel_info);
1366 slot->npeel = npeel;
1367 slot->dr_info = dr_info;
1368 slot->count = 1;
1369 new_slot = peeling_htab->find_slot (slot, INSERT);
1370 *new_slot = slot;
1373 if (!supportable_dr_alignment
1374 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1375 slot->count += VECT_MAX_COST;
1379 /* Traverse peeling hash table to find peeling option that aligns maximum
1380 number of data accesses. */
1383 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1384 _vect_peel_extended_info *max)
1386 vect_peel_info elem = *slot;
1388 if (elem->count > max->peel_info.count
1389 || (elem->count == max->peel_info.count
1390 && max->peel_info.npeel > elem->npeel))
1392 max->peel_info.npeel = elem->npeel;
1393 max->peel_info.count = elem->count;
1394 max->peel_info.dr_info = elem->dr_info;
1397 return 1;
1400 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1401 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1402 we assume DR0_INFO's misalignment will be zero after peeling. */
1404 static void
1405 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1406 dr_vec_info *dr0_info,
1407 unsigned int *inside_cost,
1408 unsigned int *outside_cost,
1409 stmt_vector_for_cost *body_cost_vec,
1410 stmt_vector_for_cost *prologue_cost_vec,
1411 unsigned int npeel,
1412 bool unknown_misalignment)
1414 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1415 unsigned i;
1416 data_reference *dr;
1418 FOR_EACH_VEC_ELT (datarefs, i, dr)
1420 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1421 stmt_vec_info stmt_info = dr_info->stmt;
1422 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1423 continue;
1425 /* For interleaving, only the alignment of the first access
1426 matters. */
1427 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1428 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1429 continue;
1431 /* Strided accesses perform only component accesses, alignment is
1432 irrelevant for them. */
1433 if (STMT_VINFO_STRIDED_P (stmt_info)
1434 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1435 continue;
1437 int save_misalignment;
1438 save_misalignment = DR_MISALIGNMENT (dr_info);
1439 if (npeel == 0)
1440 ;
1441 else if (unknown_misalignment && dr_info == dr0_info)
1442 SET_DR_MISALIGNMENT (dr_info, 0);
1443 else
1444 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1445 vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1446 body_cost_vec, prologue_cost_vec);
1447 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1451 /* Traverse peeling hash table and calculate cost for each peeling option.
1452 Find the one with the lowest cost. */
1455 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1456 _vect_peel_extended_info *min)
1458 vect_peel_info elem = *slot;
1459 int dummy;
1460 unsigned int inside_cost = 0, outside_cost = 0;
1461 stmt_vec_info stmt_info = elem->dr_info->stmt;
1462 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1463 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1464 epilogue_cost_vec;
1466 prologue_cost_vec.create (2);
1467 body_cost_vec.create (2);
1468 epilogue_cost_vec.create (2);
1470 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1471 &outside_cost, &body_cost_vec,
1472 &prologue_cost_vec, elem->npeel, false);
1474 body_cost_vec.release ();
1476 outside_cost += vect_get_known_peeling_cost
1477 (loop_vinfo, elem->npeel, &dummy,
1478 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1479 &prologue_cost_vec, &epilogue_cost_vec);
1481 /* Prologue and epilogue costs are added to the target model later.
1482 These costs depend only on the scalar iteration cost, the
1483 number of peeling iterations finally chosen, and the number of
1484 misaligned statements. So discard the information found here. */
1485 prologue_cost_vec.release ();
1486 epilogue_cost_vec.release ();
1488 if (inside_cost < min->inside_cost
1489 || (inside_cost == min->inside_cost
1490 && outside_cost < min->outside_cost))
1492 min->inside_cost = inside_cost;
1493 min->outside_cost = outside_cost;
1494 min->peel_info.dr_info = elem->dr_info;
1495 min->peel_info.npeel = elem->npeel;
1496 min->peel_info.count = elem->count;
1499 return 1;
1503 /* Choose best peeling option by traversing peeling hash table and either
1504 choosing an option with the lowest cost (if cost model is enabled) or the
1505 option that aligns as many accesses as possible. */
1507 static struct _vect_peel_extended_info
1508 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1509 loop_vec_info loop_vinfo)
1511 struct _vect_peel_extended_info res;
1513 res.peel_info.dr_info = NULL;
1515 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1517 res.inside_cost = INT_MAX;
1518 res.outside_cost = INT_MAX;
1519 peeling_htab->traverse <_vect_peel_extended_info *,
1520 vect_peeling_hash_get_lowest_cost> (&res);
1522 else
1524 res.peel_info.count = 0;
1525 peeling_htab->traverse <_vect_peel_extended_info *,
1526 vect_peeling_hash_get_most_frequent> (&res);
1527 res.inside_cost = 0;
1528 res.outside_cost = 0;
1531 return res;
1534 /* Return true if the new peeling NPEEL is supported. */
1536 static bool
1537 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1538 unsigned npeel)
1540 unsigned i;
1541 struct data_reference *dr = NULL;
1542 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1543 enum dr_alignment_support supportable_dr_alignment;
1545 /* Ensure that all data refs can be vectorized after the peel. */
1546 FOR_EACH_VEC_ELT (datarefs, i, dr)
1548 int save_misalignment;
1550 if (dr == dr0_info->dr)
1551 continue;
1553 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1554 stmt_vec_info stmt_info = dr_info->stmt;
1555 /* For interleaving, only the alignment of the first access
1556 matters. */
1557 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1558 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1559 continue;
1561 /* Strided accesses perform only component accesses, alignment is
1562 irrelevant for them. */
1563 if (STMT_VINFO_STRIDED_P (stmt_info)
1564 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1565 continue;
1567 save_misalignment = DR_MISALIGNMENT (dr_info);
1568 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1569 supportable_dr_alignment
1570 = vect_supportable_dr_alignment (dr_info, false);
1571 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1573 if (!supportable_dr_alignment)
1574 return false;
1577 return true;
1580 /* Function vect_enhance_data_refs_alignment
1582 This pass will use loop versioning and loop peeling in order to enhance
1583 the alignment of data references in the loop.
1585 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1586 original loop is to be vectorized. Any other loops that are created by
1587 the transformations performed in this pass - are not supposed to be
1588 vectorized. This restriction will be relaxed.
1590 This pass will require a cost model to guide it whether to apply peeling
1591 or versioning or a combination of the two. For example, the scheme that
1592 Intel uses when given a loop with several memory accesses is as follows:
1593 choose one memory access ('p') whose alignment you want to force by doing
1594 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1595 other accesses are not necessarily aligned, or (2) use loop versioning to
1596 generate one loop in which all accesses are aligned, and another loop in
1597 which only 'p' is necessarily aligned.
1599 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1600 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1601 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1603 Devising a cost model is the most critical aspect of this work. It will
1604 guide us on which access to peel for, whether to use loop versioning, how
1605 many versions to create, etc. The cost model will probably consist of
1606 generic considerations as well as target specific considerations (on
1607 powerpc for example, misaligned stores are more painful than misaligned
1608 loads).
1610 Here are the general steps involved in alignment enhancements:
1612 -- original loop, before alignment analysis:
1613 for (i=0; i<N; i++){
1614 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1615 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1618 -- After vect_compute_data_refs_alignment:
1619 for (i=0; i<N; i++){
1620 x = q[i]; # DR_MISALIGNMENT(q) = 3
1621 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1624 -- Possibility 1: we do loop versioning:
1625 if (p is aligned) {
1626 for (i=0; i<N; i++){ # loop 1A
1627 x = q[i]; # DR_MISALIGNMENT(q) = 3
1628 p[i] = y; # DR_MISALIGNMENT(p) = 0
1631 else {
1632 for (i=0; i<N; i++){ # loop 1B
1633 x = q[i]; # DR_MISALIGNMENT(q) = 3
1634 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1638 -- Possibility 2: we do loop peeling:
1639 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1640 x = q[i];
1641 p[i] = y;
1643 for (i = 3; i < N; i++){ # loop 2A
1644 x = q[i]; # DR_MISALIGNMENT(q) = 0
1645 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1648 -- Possibility 3: combination of loop peeling and versioning:
1649 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1650 x = q[i];
1651 p[i] = y;
1653 if (p is aligned) {
1654 for (i = 3; i<N; i++){ # loop 3A
1655 x = q[i]; # DR_MISALIGNMENT(q) = 0
1656 p[i] = y; # DR_MISALIGNMENT(p) = 0
1659 else {
1660 for (i = 3; i<N; i++){ # loop 3B
1661 x = q[i]; # DR_MISALIGNMENT(q) = 0
1662 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1666 These loops are later passed to loop_transform to be vectorized. The
1667 vectorizer will use the alignment information to guide the transformation
1668 (whether to generate regular loads/stores, or with special handling for
1669 misalignment). */
1671 bool
1672 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1674 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1675 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1676 enum dr_alignment_support supportable_dr_alignment;
1677 dr_vec_info *first_store = NULL;
1678 dr_vec_info *dr0_info = NULL;
1679 struct data_reference *dr;
1680 unsigned int i, j;
1681 bool do_peeling = false;
1682 bool do_versioning = false;
1683 bool stat;
1684 unsigned int npeel = 0;
1685 bool one_misalignment_known = false;
1686 bool one_misalignment_unknown = false;
1687 bool one_dr_unsupportable = false;
1688 dr_vec_info *unsupportable_dr_info = NULL;
1689 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1690 unsigned possible_npeel_number = 1;
1691 tree vectype;
1692 unsigned int mis, same_align_drs_max = 0;
1693 hash_table<peel_info_hasher> peeling_htab (1);
1695 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1697 /* Reset data so we can safely be called multiple times. */
1698 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1699 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1701 /* While cost model enhancements are expected in the future, the high level
1702 view of the code at this time is as follows:
1704 A) If there is a misaligned access then see if peeling to align
1705 this access can make all data references satisfy
1706 vect_supportable_dr_alignment. If so, update data structures
1707 as needed and return true.
1709 B) If peeling wasn't possible and there is a data reference with an
1710 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1711 then see if loop versioning checks can be used to make all data
1712 references satisfy vect_supportable_dr_alignment. If so, update
1713 data structures as needed and return true.
1715 C) If neither peeling nor versioning were successful then return false if
1716 any data reference does not satisfy vect_supportable_dr_alignment.
1718 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1720 Note, Possibility 3 above (which is peeling and versioning together) is not
1721 being done at this time. */
1723 /* (1) Peeling to force alignment. */
1725 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1726 Considerations:
1727 + How many accesses will become aligned due to the peeling
1728 - How many accesses will become unaligned due to the peeling,
1729 and the cost of misaligned accesses.
1730 - The cost of peeling (the extra runtime checks, the increase
1731 in code size). */
1733 FOR_EACH_VEC_ELT (datarefs, i, dr)
1735 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1736 stmt_vec_info stmt_info = dr_info->stmt;
1738 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1739 continue;
1741 /* For interleaving, only the alignment of the first access
1742 matters. */
1743 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1744 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1745 continue;
1747 /* For scatter-gather or invariant accesses there is nothing
1748 to enhance. */
1749 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1750 || integer_zerop (DR_STEP (dr)))
1751 continue;
1753 /* Strided accesses perform only component accesses, alignment is
1754 irrelevant for them. */
1755 if (STMT_VINFO_STRIDED_P (stmt_info)
1756 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1757 continue;
1759 supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1760 do_peeling = vector_alignment_reachable_p (dr_info);
1761 if (do_peeling)
1763 if (known_alignment_for_access_p (dr_info))
1765 unsigned int npeel_tmp = 0;
1766 bool negative = tree_int_cst_compare (DR_STEP (dr),
1767 size_zero_node) < 0;
1769 vectype = STMT_VINFO_VECTYPE (stmt_info);
1770 unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info);
1771 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1772 mis = (negative
1773 ? DR_MISALIGNMENT (dr_info)
1774 : -DR_MISALIGNMENT (dr_info));
1775 if (DR_MISALIGNMENT (dr_info) != 0)
1776 npeel_tmp = (mis & (target_align - 1)) / dr_size;
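                  /* Illustrative arithmetic only (hypothetical numbers, not
                     taken from this loop): with a forward step, a 4-byte
                     element, a 16-byte target alignment and DR_MISALIGNMENT
                     of 12 bytes, MIS is -12, (-12 & 15) == 4, so NPEEL_TMP is
                     4 / 4 == 1: peeling one scalar iteration advances the
                     address by 4 bytes and makes it 16-byte aligned.  */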
1778 /* For multiple types, it is possible that the bigger type access
1779 will have more than one peeling option. E.g., a loop with two
1780 types: one of size (vector size / 4), and the other one of
1780              size (vector size / 8). The vectorization factor will be 8. If both
1782 accesses are misaligned by 3, the first one needs one scalar
1783 iteration to be aligned, and the second one needs 5. But the
1784 first one will be aligned also by peeling 5 scalar
1785 iterations, and in that case both accesses will be aligned.
1786              Hence, in addition to the immediate peeling amount, we also want
1787              to try adding full vector sizes, as long as we don't exceed the
1788              vectorization factor.
1789              We do this automatically for the cost model, since we calculate
1790              the cost for every peeling option. */
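                  /* A sketch of the hash-table bookkeeping below, using
                     hypothetical numbers: if NPEEL_TMP starts at 1,
                     TARGET_ALIGN / DR_SIZE is 4 and POSSIBLE_NPEEL_NUMBER is 2,
                     then peeling amounts 1 and 5 are recorded for this data
                     reference, matching the two-types example above.  */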
1791 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1793 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1794 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1795 possible_npeel_number
1796 = vect_get_num_vectors (nscalars, vectype);
1798 /* NPEEL_TMP is 0 when there is no misalignment, but also
1799 allow peeling NELEMENTS. */
1800 if (DR_MISALIGNMENT (dr_info) == 0)
1801 possible_npeel_number++;
1804 /* Save info about DR in the hash table. Also include peeling
1805 amounts according to the explanation above. */
1806 for (j = 0; j < possible_npeel_number; j++)
1808 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1809 dr_info, npeel_tmp);
1810 npeel_tmp += target_align / dr_size;
1813 one_misalignment_known = true;
1815 else
1817          /* If we don't know any misalignment values, we prefer
1818             peeling for the data-ref that has the maximum number of data-refs
1819             with the same alignment, unless the target prefers to align
1820             stores over loads. */
1821 unsigned same_align_drs
1822 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1823 if (!dr0_info
1824 || same_align_drs_max < same_align_drs)
1826 same_align_drs_max = same_align_drs;
1827 dr0_info = dr_info;
1829 /* For data-refs with the same number of related
1830 accesses prefer the one where the misalign
1831 computation will be invariant in the outermost loop. */
1832 else if (same_align_drs_max == same_align_drs)
1834 struct loop *ivloop0, *ivloop;
1835 ivloop0 = outermost_invariant_loop_for_expr
1836 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1837 ivloop = outermost_invariant_loop_for_expr
1838 (loop, DR_BASE_ADDRESS (dr));
1839 if ((ivloop && !ivloop0)
1840 || (ivloop && ivloop0
1841 && flow_loop_nested_p (ivloop, ivloop0)))
1842 dr0_info = dr_info;
1845 one_misalignment_unknown = true;
1847 /* Check for data refs with unsupportable alignment that
1848 can be peeled. */
1849 if (!supportable_dr_alignment)
1851 one_dr_unsupportable = true;
1852 unsupportable_dr_info = dr_info;
1855 if (!first_store && DR_IS_WRITE (dr))
1856 first_store = dr_info;
1859 else
1861 if (!aligned_access_p (dr_info))
1863 if (dump_enabled_p ())
1864 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1865 "vector alignment may not be reachable\n");
1866 break;
1871 /* Check if we can possibly peel the loop. */
1872 if (!vect_can_advance_ivs_p (loop_vinfo)
1873 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1874 || loop->inner)
1875 do_peeling = false;
1877 struct _vect_peel_extended_info peel_for_known_alignment;
1878 struct _vect_peel_extended_info peel_for_unknown_alignment;
1879 struct _vect_peel_extended_info best_peel;
1881 peel_for_unknown_alignment.inside_cost = INT_MAX;
1882 peel_for_unknown_alignment.outside_cost = INT_MAX;
1883 peel_for_unknown_alignment.peel_info.count = 0;
1885 if (do_peeling
1886 && one_misalignment_unknown)
1888      /* Check whether the target prefers stores over loads, i.e., whether
1889         misaligned stores are more expensive than misaligned loads (taking
1890         drs with the same alignment into account). */
1891 unsigned int load_inside_cost = 0;
1892 unsigned int load_outside_cost = 0;
1893 unsigned int store_inside_cost = 0;
1894 unsigned int store_outside_cost = 0;
1895 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1897 stmt_vector_for_cost dummy;
1898 dummy.create (2);
1899 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1900 &load_inside_cost,
1901 &load_outside_cost,
1902 &dummy, &dummy, estimated_npeels, true);
1903 dummy.release ();
1905 if (first_store)
1907 dummy.create (2);
1908 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1909 &store_inside_cost,
1910 &store_outside_cost,
1911 &dummy, &dummy,
1912 estimated_npeels, true);
1913 dummy.release ();
1915 else
1917 store_inside_cost = INT_MAX;
1918 store_outside_cost = INT_MAX;
1921 if (load_inside_cost > store_inside_cost
1922 || (load_inside_cost == store_inside_cost
1923 && load_outside_cost > store_outside_cost))
1925 dr0_info = first_store;
1926 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1927 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1929 else
1931 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1932 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1935 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1936 prologue_cost_vec.create (2);
1937 epilogue_cost_vec.create (2);
1939 int dummy2;
1940 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1941 (loop_vinfo, estimated_npeels, &dummy2,
1942 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1943 &prologue_cost_vec, &epilogue_cost_vec);
1945 prologue_cost_vec.release ();
1946 epilogue_cost_vec.release ();
1948 peel_for_unknown_alignment.peel_info.count = 1
1949 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1952 peel_for_unknown_alignment.peel_info.npeel = 0;
1953 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1955 best_peel = peel_for_unknown_alignment;
1957 peel_for_known_alignment.inside_cost = INT_MAX;
1958 peel_for_known_alignment.outside_cost = INT_MAX;
1959 peel_for_known_alignment.peel_info.count = 0;
1960 peel_for_known_alignment.peel_info.dr_info = NULL;
1962 if (do_peeling && one_misalignment_known)
1964      /* Peeling is possible, and every data access is supported even when
1965         misaligned.  So we try to choose the best possible peeling from
1966         the hash table. */
1967 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1968 (&peeling_htab, loop_vinfo);
1971 /* Compare costs of peeling for known and unknown alignment. */
1972 if (peel_for_known_alignment.peel_info.dr_info != NULL
1973 && peel_for_unknown_alignment.inside_cost
1974 >= peel_for_known_alignment.inside_cost)
1976 best_peel = peel_for_known_alignment;
1978 /* If the best peeling for known alignment has NPEEL == 0, perform no
1979 peeling at all except if there is an unsupportable dr that we can
1980 align. */
1981 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1982 do_peeling = false;
1985 /* If there is an unsupportable data ref, prefer this over all choices so far
1986 since we'd have to discard a chosen peeling except when it accidentally
1987 aligned the unsupportable data ref. */
1988 if (one_dr_unsupportable)
1989 dr0_info = unsupportable_dr_info;
1990 else if (do_peeling)
1992 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1993 TODO: Use nopeel_outside_cost or get rid of it? */
1994 unsigned nopeel_inside_cost = 0;
1995 unsigned nopeel_outside_cost = 0;
1997 stmt_vector_for_cost dummy;
1998 dummy.create (2);
1999 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2000 &nopeel_outside_cost, &dummy, &dummy,
2001 0, false);
2002 dummy.release ();
2004 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2005 costs will be recorded. */
2006 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2007 prologue_cost_vec.create (2);
2008 epilogue_cost_vec.create (2);
2010 int dummy2;
2011 nopeel_outside_cost += vect_get_known_peeling_cost
2012 (loop_vinfo, 0, &dummy2,
2013 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2014 &prologue_cost_vec, &epilogue_cost_vec);
2016 prologue_cost_vec.release ();
2017 epilogue_cost_vec.release ();
2019 npeel = best_peel.peel_info.npeel;
2020 dr0_info = best_peel.peel_info.dr_info;
2022      /* If not peeling at all is no more expensive than the best peeling
2023         we have found so far, don't perform any peeling. */
2024 if (nopeel_inside_cost <= best_peel.inside_cost)
2025 do_peeling = false;
2028 if (do_peeling)
2030 stmt_vec_info stmt_info = dr0_info->stmt;
2031 vectype = STMT_VINFO_VECTYPE (stmt_info);
2033 if (known_alignment_for_access_p (dr0_info))
2035 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2036 size_zero_node) < 0;
2037 if (!npeel)
2039 /* Since it's known at compile time, compute the number of
2040 iterations in the peeled loop (the peeling factor) for use in
2041 updating DR_MISALIGNMENT values. The peeling factor is the
2042 vectorization factor minus the misalignment as an element
2043 count. */
2044 mis = (negative
2045 ? DR_MISALIGNMENT (dr0_info)
2046 : -DR_MISALIGNMENT (dr0_info));
2047 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
2048 npeel = ((mis & (target_align - 1))
2049 / vect_get_scalar_dr_size (dr0_info));
2052 /* For interleaved data access every iteration accesses all the
2053 members of the group, therefore we divide the number of iterations
2054 by the group size. */
2055 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2056 npeel /= DR_GROUP_SIZE (stmt_info);
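              /* Worked example (hypothetical numbers): for an interleaved
                 group of size 4 whose element-count peeling amount computed
                 above is 8, each scalar iteration touches all 4 group
                 members, so NPEEL becomes 8 / 4 == 2 peeled scalar
                 iterations.  */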
2058 if (dump_enabled_p ())
2059 dump_printf_loc (MSG_NOTE, vect_location,
2060 "Try peeling by %d\n", npeel);
2063 /* Ensure that all datarefs can be vectorized after the peel. */
2064 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2065 do_peeling = false;
2067 /* Check if all datarefs are supportable and log. */
2068 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2070 stat = vect_verify_datarefs_alignment (loop_vinfo);
2071 if (!stat)
2072 do_peeling = false;
2073 else
2074 return stat;
2077 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2078 if (do_peeling)
2080 unsigned max_allowed_peel
2081 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2082 if (max_allowed_peel != (unsigned)-1)
2084 unsigned max_peel = npeel;
2085 if (max_peel == 0)
2087 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
2088 max_peel = (target_align
2089 / vect_get_scalar_dr_size (dr0_info) - 1);
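              /* For instance (hypothetical numbers): with a 16-byte target
                 alignment and 4-byte elements, the unknown peeling amount can
                 be at most 16 / 4 - 1 == 3 scalar iterations, and that is
                 what is compared against
                 --param vect-max-peeling-for-alignment below.  */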
2091 if (max_peel > max_allowed_peel)
2093 do_peeling = false;
2094 if (dump_enabled_p ())
2095 dump_printf_loc (MSG_NOTE, vect_location,
2096 "Disable peeling, max peels reached: %d\n", max_peel);
2101 /* Cost model #2 - if peeling may result in a remaining loop not
2102 iterating enough to be vectorized then do not peel. Since this
2103 is a cost heuristic rather than a correctness decision, use the
2104 most likely runtime value for variable vectorization factors. */
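      /* As a numeric sketch (hypothetical numbers): with an assumed VF of 4
         and NPEEL of 3, the check below requires at least 4 + 3 = 7 known
         scalar iterations; with fewer, the vectorized loop might not run at
         all after peeling, so peeling is abandoned.  */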
2105 if (do_peeling
2106 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2108 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2109 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2110 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2111 < assumed_vf + max_peel)
2112 do_peeling = false;
2115 if (do_peeling)
2117 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2118 If the misalignment of DR_i is identical to that of dr0 then set
2119 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2120 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2121 by the peeling factor times the element size of DR_i (MOD the
2122 vectorization factor times the size). Otherwise, the
2123 misalignment of DR_i must be set to unknown. */
2124 FOR_EACH_VEC_ELT (datarefs, i, dr)
2125 if (dr != dr0_info->dr)
2127 /* Strided accesses perform only component accesses, alignment
2128 is irrelevant for them. */
2129 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2130 stmt_info = dr_info->stmt;
2131 if (STMT_VINFO_STRIDED_P (stmt_info)
2132 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2133 continue;
2135 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2138 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2139 if (npeel)
2140 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2141 else
2142 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2143 = DR_MISALIGNMENT (dr0_info);
2144 SET_DR_MISALIGNMENT (dr0_info, 0);
2145 if (dump_enabled_p ())
2147 dump_printf_loc (MSG_NOTE, vect_location,
2148 "Alignment of access forced using peeling.\n");
2149 dump_printf_loc (MSG_NOTE, vect_location,
2150 "Peeling for alignment will be applied.\n");
2153 /* The inside-loop cost will be accounted for in vectorizable_load
2154 and vectorizable_store correctly with adjusted alignments.
2155 Drop the body_cst_vec on the floor here. */
2156 stat = vect_verify_datarefs_alignment (loop_vinfo);
2157 gcc_assert (stat);
2158 return stat;
2162 /* (2) Versioning to force alignment. */
2164 /* Try versioning if:
2165 1) optimize loop for speed
2166 2) there is at least one unsupported misaligned data ref with an unknown
2167 misalignment, and
2168 3) all misaligned data refs with a known misalignment are supported, and
2169 4) the number of runtime alignment checks is within reason. */
2171 do_versioning =
2172 optimize_loop_nest_for_speed_p (loop)
2173 && (!loop->inner); /* FORNOW */
2175 if (do_versioning)
2177 FOR_EACH_VEC_ELT (datarefs, i, dr)
2179 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2180 stmt_vec_info stmt_info = dr_info->stmt;
2182 /* For interleaving, only the alignment of the first access
2183 matters. */
2184 if (aligned_access_p (dr_info)
2185 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2186 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2187 continue;
2189 if (STMT_VINFO_STRIDED_P (stmt_info))
2191 /* Strided loads perform only component accesses, alignment is
2192 irrelevant for them. */
2193 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2194 continue;
2195 do_versioning = false;
2196 break;
2199 supportable_dr_alignment
2200 = vect_supportable_dr_alignment (dr_info, false);
2202 if (!supportable_dr_alignment)
2204 int mask;
2205 tree vectype;
2207 if (known_alignment_for_access_p (dr_info)
2208 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2209 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2211 do_versioning = false;
2212 break;
2215 vectype = STMT_VINFO_VECTYPE (stmt_info);
2216 gcc_assert (vectype);
2218 /* At present we don't support versioning for alignment
2219 with variable VF, since there's no guarantee that the
2220 VF is a power of two. We could relax this if we added
2221 a way of enforcing a power-of-two size. */
2222 unsigned HOST_WIDE_INT size;
2223 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2225 do_versioning = false;
2226 break;
2229 /* The rightmost bits of an aligned address must be zeros.
2230 Construct the mask needed for this test. For example,
2231 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2232 mask must be 15 = 0xf. */
2233 mask = size - 1;
2235 /* FORNOW: use the same mask to test all potentially unaligned
2236 references in the loop. The vectorizer currently supports
2237 a single vector size, see the reference to
2238 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2239 vectorization factor is computed. */
2240 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2241 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2242 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2243 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
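              /* In outline (a sketch, not the exact code generated
                 elsewhere): the runtime guard of the versioned loop later
                 tests something like
                   ((addr_1 | addr_2 | ... | addr_n) & LOOP_VINFO_PTR_MASK) == 0
                 over the addresses of the statements collected here, which is
                 why a single mask only works while all of them share one
                 vector size.  */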
2247 /* Versioning requires at least one misaligned data reference. */
2248 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2249 do_versioning = false;
2250 else if (!do_versioning)
2251 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2254 if (do_versioning)
2256 vec<stmt_vec_info> may_misalign_stmts
2257 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2258 stmt_vec_info stmt_info;
2260 /* It can now be assumed that the data references in the statements
2261 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2262 of the loop being vectorized. */
2263 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2265 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2266 SET_DR_MISALIGNMENT (dr_info, 0);
2267 if (dump_enabled_p ())
2268 dump_printf_loc (MSG_NOTE, vect_location,
2269 "Alignment of access forced using versioning.\n");
2272 if (dump_enabled_p ())
2273 dump_printf_loc (MSG_NOTE, vect_location,
2274 "Versioning for alignment will be applied.\n");
2276 /* Peeling and versioning can't be done together at this time. */
2277 gcc_assert (! (do_peeling && do_versioning));
2279 stat = vect_verify_datarefs_alignment (loop_vinfo);
2280 gcc_assert (stat);
2281 return stat;
2284 /* This point is reached if neither peeling nor versioning is being done. */
2285 gcc_assert (! (do_peeling || do_versioning));
2287 stat = vect_verify_datarefs_alignment (loop_vinfo);
2288 return stat;
2292 /* Function vect_find_same_alignment_drs.
2294 Update group and alignment relations in VINFO according to the chosen
2295 vectorization factor. */
2297 static void
2298 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2300 struct data_reference *dra = DDR_A (ddr);
2301 struct data_reference *drb = DDR_B (ddr);
2302 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2303 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2304 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2305 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2307 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2308 return;
2310 if (dra == drb)
2311 return;
2313 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2314 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2315 return;
2317 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2318 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2319 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2320 return;
2322 /* Two references with distance zero have the same alignment. */
2323 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2324 - wi::to_poly_offset (DR_INIT (drb)));
2325 if (maybe_ne (diff, 0))
2327 /* Get the wider of the two alignments. */
2328 unsigned int align_a = (vect_calculate_target_alignment (dr_info_a)
2329 / BITS_PER_UNIT);
2330 unsigned int align_b = (vect_calculate_target_alignment (dr_info_b)
2331 / BITS_PER_UNIT);
2332 unsigned int max_align = MAX (align_a, align_b);
2334 /* Require the gap to be a multiple of the larger vector alignment. */
2335 if (!multiple_p (diff, max_align))
2336 return;
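      /* Small example (hypothetical numbers): two accesses whose DR_INITs
         differ by 32 bytes, with target alignments of 16 and 8 bytes, have
         MAX_ALIGN == 16; since 32 is a multiple of 16 the two references
         always have the same misalignment and are recorded below as having
         the same alignment.  */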
2339 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2340 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2341 if (dump_enabled_p ())
2343 dump_printf_loc (MSG_NOTE, vect_location,
2344 "accesses have the same alignment: ");
2345 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2346 dump_printf (MSG_NOTE, " and ");
2347 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2348 dump_printf (MSG_NOTE, "\n");
2353 /* Function vect_analyze_data_refs_alignment
2355 Analyze the alignment of the data-references in the loop.
2356 Return FALSE if a data reference is found that cannot be vectorized. */
2358 bool
2359 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2361 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2363 /* Mark groups of data references with same alignment using
2364 data dependence information. */
2365 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2366 struct data_dependence_relation *ddr;
2367 unsigned int i;
2369 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2370 vect_find_same_alignment_drs (vinfo, ddr);
2372 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2373 struct data_reference *dr;
2375 vect_record_base_alignments (vinfo);
2376 FOR_EACH_VEC_ELT (datarefs, i, dr)
2378 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2379 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2380 vect_compute_data_ref_alignment (dr_info);
2383 return true;
2387 /* Analyze alignment of DRs of stmts in NODE. */
2389 static bool
2390 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2392 /* We vectorize from the first scalar stmt in the node unless
2393 the node is permuted in which case we start from the first
2394 element in the group. */
2395 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2396 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2397 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2398 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2400 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2401 vect_compute_data_ref_alignment (dr_info);
2402 /* For creating the data-ref pointer we need alignment of the
2403 first element anyway. */
2404 if (dr_info != first_dr_info)
2405 vect_compute_data_ref_alignment (first_dr_info);
2406 if (! verify_data_ref_alignment (dr_info))
2408 if (dump_enabled_p ())
2409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2410 "not vectorized: bad data alignment in basic "
2411 "block.\n");
2412 return false;
2415 return true;
2418 /* Function vect_slp_analyze_and_verify_instance_alignment
2420 Analyze the alignment of the data-references in the SLP instance.
2421 Return FALSE if a data reference is found that cannot be vectorized. */
2423 bool
2424 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2426 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2428 slp_tree node;
2429 unsigned i;
2430 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2431 if (! vect_slp_analyze_and_verify_node_alignment (node))
2432 return false;
2434 node = SLP_INSTANCE_TREE (instance);
2435 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2436 && ! vect_slp_analyze_and_verify_node_alignment
2437 (SLP_INSTANCE_TREE (instance)))
2438 return false;
2440 return true;
2444 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2445 accesses of legal size, step, etc. Detect gaps, single element
2446 interleaving, and other special cases. Set grouped access info.
2447 Collect groups of strided stores for further use in SLP analysis.
2448 Worker for vect_analyze_group_access. */
2450 static bool
2451 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2453 data_reference *dr = dr_info->dr;
2454 tree step = DR_STEP (dr);
2455 tree scalar_type = TREE_TYPE (DR_REF (dr));
2456 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2457 stmt_vec_info stmt_info = dr_info->stmt;
2458 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2459 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2460 HOST_WIDE_INT dr_step = -1;
2461 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2462 bool slp_impossible = false;
2464 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2465 size of the interleaving group (including gaps). */
2466 if (tree_fits_shwi_p (step))
2468 dr_step = tree_to_shwi (step);
2469 /* Check that STEP is a multiple of type size. Otherwise there is
2470 a non-element-sized gap at the end of the group which we
2471 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2472 ??? As we can handle non-constant step fine here we should
2473 simply remove uses of DR_GROUP_GAP between the last and first
2474 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2475 simply not include that gap. */
2476 if ((dr_step % type_size) != 0)
2478 if (dump_enabled_p ())
2480 dump_printf_loc (MSG_NOTE, vect_location,
2481 "Step ");
2482 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2483 dump_printf (MSG_NOTE,
2484 " is not a multiple of the element size for ");
2485 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2486 dump_printf (MSG_NOTE, "\n");
2488 return false;
2490 groupsize = absu_hwi (dr_step) / type_size;
2492 else
2493 groupsize = 0;
2495   /* A non-consecutive access is possible only if it is part of an interleaving group. */
2496 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2498       /* Check whether this DR is part of an interleaving group, and is a single
2499          element of the group that is accessed in the loop. */
2501 /* Gaps are supported only for loads. STEP must be a multiple of the type
2502 size. */
2503 if (DR_IS_READ (dr)
2504 && (dr_step % type_size) == 0
2505 && groupsize > 0)
2507 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2508 DR_GROUP_SIZE (stmt_info) = groupsize;
2509 DR_GROUP_GAP (stmt_info) = groupsize - 1;
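          /* Example of the single-element-interleaving case (hypothetical
             loop):
               for (i = 0; i < n; i++)
                 x += a[4 * i];
             with 4-byte elements, DR_STEP is 16, GROUPSIZE is 16 / 4 == 4 and
             DR_GROUP_GAP is 3: each iteration reads one element and skips the
             next three.  */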
2510 if (dump_enabled_p ())
2512 dump_printf_loc (MSG_NOTE, vect_location,
2513 "Detected single element interleaving ");
2514 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2515 dump_printf (MSG_NOTE, " step ");
2516 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2517 dump_printf (MSG_NOTE, "\n");
2520 return true;
2523 if (dump_enabled_p ())
2525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2526 "not consecutive access ");
2527 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2528 stmt_info->stmt, 0);
2531 if (bb_vinfo)
2533 /* Mark the statement as unvectorizable. */
2534 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2535 return true;
2538 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2539 STMT_VINFO_STRIDED_P (stmt_info) = true;
2540 return true;
2543 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2545 /* First stmt in the interleaving chain. Check the chain. */
2546 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2547 struct data_reference *data_ref = dr;
2548 unsigned int count = 1;
2549 tree prev_init = DR_INIT (data_ref);
2550 stmt_vec_info prev = stmt_info;
2551 HOST_WIDE_INT diff, gaps = 0;
2553 /* By construction, all group members have INTEGER_CST DR_INITs. */
2554 while (next)
2556          /* Skip identical data-refs.  If two or more stmts share a
2557             data-ref (supported only for loads), we vectorize only the first
2558             stmt, and the rest get their vectorized loads from the first
2559             one. */
2560 if (!tree_int_cst_compare (DR_INIT (data_ref),
2561 DR_INIT (STMT_VINFO_DATA_REF (next))))
2563 if (DR_IS_WRITE (data_ref))
2565 if (dump_enabled_p ())
2566 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2567 "Two store stmts share the same dr.\n");
2568 return false;
2571 if (dump_enabled_p ())
2572 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2573 "Two or more load stmts share the same dr.\n");
2575 /* For load use the same data-ref load. */
2576 DR_GROUP_SAME_DR_STMT (next) = prev;
2578 prev = next;
2579 next = DR_GROUP_NEXT_ELEMENT (next);
2580 continue;
2583 prev = next;
2584 data_ref = STMT_VINFO_DATA_REF (next);
2586 /* All group members have the same STEP by construction. */
2587 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2589 /* Check that the distance between two accesses is equal to the type
2590 size. Otherwise, we have gaps. */
2591 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2592 - TREE_INT_CST_LOW (prev_init)) / type_size;
2593 if (diff != 1)
2595 /* FORNOW: SLP of accesses with gaps is not supported. */
2596 slp_impossible = true;
2597 if (DR_IS_WRITE (data_ref))
2599 if (dump_enabled_p ())
2600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2601 "interleaved store with gaps\n");
2602 return false;
2605 gaps += diff - 1;
2608 last_accessed_element += diff;
2610 /* Store the gap from the previous member of the group. If there is no
2611 gap in the access, DR_GROUP_GAP is always 1. */
2612 DR_GROUP_GAP (next) = diff;
2614 prev_init = DR_INIT (data_ref);
2615 next = DR_GROUP_NEXT_ELEMENT (next);
2616 /* Count the number of data-refs in the chain. */
2617 count++;
2620 if (groupsize == 0)
2621 groupsize = count + gaps;
2623 /* This could be UINT_MAX but as we are generating code in a very
2624 inefficient way we have to cap earlier. See PR78699 for example. */
2625 if (groupsize > 4096)
2627 if (dump_enabled_p ())
2628 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2629 "group is too large\n");
2630 return false;
2633 /* Check that the size of the interleaving is equal to count for stores,
2634 i.e., that there are no gaps. */
2635 if (groupsize != count
2636 && !DR_IS_READ (dr))
2638 if (dump_enabled_p ())
2639 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2640 "interleaved store with gaps\n");
2641 return false;
2644 /* If there is a gap after the last load in the group it is the
2645 difference between the groupsize and the last accessed
2646 element.
2647 When there is no gap, this difference should be 0. */
2648 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
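      /* For example (hypothetical numbers): a load group with GROUPSIZE 4
         whose members cover only the first three element slots has
         LAST_ACCESSED_ELEMENT == 3 and therefore a trailing DR_GROUP_GAP of
         4 - 3 == 1 element after the group.  */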
2650 DR_GROUP_SIZE (stmt_info) = groupsize;
2651 if (dump_enabled_p ())
2653 dump_printf_loc (MSG_NOTE, vect_location,
2654 "Detected interleaving ");
2655 if (DR_IS_READ (dr))
2656 dump_printf (MSG_NOTE, "load ");
2657 else
2658 dump_printf (MSG_NOTE, "store ");
2659 dump_printf (MSG_NOTE, "of size %u starting with ",
2660 (unsigned)groupsize);
2661 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
2662 if (DR_GROUP_GAP (stmt_info) != 0)
2663 dump_printf_loc (MSG_NOTE, vect_location,
2664 "There is a gap of %u elements after the group\n",
2665 DR_GROUP_GAP (stmt_info));
2668 /* SLP: create an SLP data structure for every interleaving group of
2669 stores for further analysis in vect_analyse_slp. */
2670 if (DR_IS_WRITE (dr) && !slp_impossible)
2672 if (loop_vinfo)
2673 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2674 if (bb_vinfo)
2675 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2679 return true;
2682 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2683 accesses of legal size, step, etc. Detect gaps, single element
2684 interleaving, and other special cases. Set grouped access info.
2685 Collect groups of strided stores for further use in SLP analysis. */
2687 static bool
2688 vect_analyze_group_access (dr_vec_info *dr_info)
2690 if (!vect_analyze_group_access_1 (dr_info))
2692 /* Dissolve the group if present. */
2693 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2694 while (stmt_info)
2696 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2697 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2698 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2699 stmt_info = next;
2701 return false;
2703 return true;
2706 /* Analyze the access pattern of the data-reference DR_INFO.
2707 In case of non-consecutive accesses call vect_analyze_group_access() to
2708 analyze groups of accesses. */
2710 static bool
2711 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2713 data_reference *dr = dr_info->dr;
2714 tree step = DR_STEP (dr);
2715 tree scalar_type = TREE_TYPE (DR_REF (dr));
2716 stmt_vec_info stmt_info = dr_info->stmt;
2717 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2718 struct loop *loop = NULL;
2720 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2721 return true;
2723 if (loop_vinfo)
2724 loop = LOOP_VINFO_LOOP (loop_vinfo);
2726 if (loop_vinfo && !step)
2728 if (dump_enabled_p ())
2729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2730 "bad data-ref access in loop\n");
2731 return false;
2734 /* Allow loads with zero step in inner-loop vectorization. */
2735 if (loop_vinfo && integer_zerop (step))
2737 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2738 if (!nested_in_vect_loop_p (loop, stmt_info))
2739 return DR_IS_READ (dr);
2740 /* Allow references with zero step for outer loops marked
2741 with pragma omp simd only - it guarantees absence of
2742 loop-carried dependencies between inner loop iterations. */
2743 if (loop->safelen < 2)
2745 if (dump_enabled_p ())
2746 dump_printf_loc (MSG_NOTE, vect_location,
2747 "zero step in inner loop of nest\n");
2748 return false;
2752 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2754 /* Interleaved accesses are not yet supported within outer-loop
2755 vectorization for references in the inner-loop. */
2756 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2758 /* For the rest of the analysis we use the outer-loop step. */
2759 step = STMT_VINFO_DR_STEP (stmt_info);
2760 if (integer_zerop (step))
2762 if (dump_enabled_p ())
2763 dump_printf_loc (MSG_NOTE, vect_location,
2764 "zero step in outer loop.\n");
2765 return DR_IS_READ (dr);
2769 /* Consecutive? */
2770 if (TREE_CODE (step) == INTEGER_CST)
2772 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2773 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2774 || (dr_step < 0
2775 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2777 /* Mark that it is not interleaving. */
2778 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2779 return true;
2783 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2785 if (dump_enabled_p ())
2786 dump_printf_loc (MSG_NOTE, vect_location,
2787 "grouped access in outer loop.\n");
2788 return false;
2792 /* Assume this is a DR handled by non-constant strided load case. */
2793 if (TREE_CODE (step) != INTEGER_CST)
2794 return (STMT_VINFO_STRIDED_P (stmt_info)
2795 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2796 || vect_analyze_group_access (dr_info)));
2798   /* Non-consecutive access - check whether it is part of an interleaving group. */
2799 return vect_analyze_group_access (dr_info);
2802 /* Compare two data-references DRA and DRB so that they can be sorted
2803    into chunks suitable for grouping. */
2805 static int
2806 dr_group_sort_cmp (const void *dra_, const void *drb_)
2808 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2809 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2810 int cmp;
2812 /* Stabilize sort. */
2813 if (dra == drb)
2814 return 0;
2816 /* DRs in different loops never belong to the same group. */
2817 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2818 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2819 if (loopa != loopb)
2820 return loopa->num < loopb->num ? -1 : 1;
2822 /* Ordering of DRs according to base. */
2823 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2824 DR_BASE_ADDRESS (drb));
2825 if (cmp != 0)
2826 return cmp;
2828 /* And according to DR_OFFSET. */
2829 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2830 if (cmp != 0)
2831 return cmp;
2833 /* Put reads before writes. */
2834 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2835 return DR_IS_READ (dra) ? -1 : 1;
2837   /* Then sort by access size. */
2838 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2839 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2840 if (cmp != 0)
2841 return cmp;
2843   /* And by step. */
2844 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2845 if (cmp != 0)
2846 return cmp;
2848   /* Then sort by DR_INIT.  In case of identical DRs, sort by stmt UID. */
2849 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2850 if (cmp == 0)
2851 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2852 return cmp;
2855 /* If OP is the result of a conversion, return the unconverted value,
2856 otherwise return null. */
2858 static tree
2859 strip_conversion (tree op)
2861 if (TREE_CODE (op) != SSA_NAME)
2862 return NULL_TREE;
2863 gimple *stmt = SSA_NAME_DEF_STMT (op);
2864 if (!is_gimple_assign (stmt)
2865 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2866 return NULL_TREE;
2867 return gimple_assign_rhs1 (stmt);
2870 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2871 and STMT2_INFO being in a single group. */
2873 static bool
2874 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
2876 if (gimple_assign_single_p (stmt1_info->stmt))
2877 return gimple_assign_single_p (stmt2_info->stmt);
2879 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2880 if (call1 && gimple_call_internal_p (call1))
2882 /* Check for two masked loads or two masked stores. */
2883 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2884 if (!call2 || !gimple_call_internal_p (call2))
2885 return false;
2886 internal_fn ifn = gimple_call_internal_fn (call1);
2887 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2888 return false;
2889 if (ifn != gimple_call_internal_fn (call2))
2890 return false;
2892 /* Check that the masks are the same. Cope with casts of masks,
2893 like those created by build_mask_conversion. */
2894 tree mask1 = gimple_call_arg (call1, 2);
2895 tree mask2 = gimple_call_arg (call2, 2);
2896 if (!operand_equal_p (mask1, mask2, 0))
2898 mask1 = strip_conversion (mask1);
2899 if (!mask1)
2900 return false;
2901 mask2 = strip_conversion (mask2);
2902 if (!mask2)
2903 return false;
2904 if (!operand_equal_p (mask1, mask2, 0))
2905 return false;
2907 return true;
2910 return false;
2913 /* Function vect_analyze_data_ref_accesses.
2915 Analyze the access pattern of all the data references in the loop.
2917 FORNOW: the only access pattern that is considered vectorizable is a
2918 simple step 1 (consecutive) access.
2920 FORNOW: handle only arrays and pointer accesses. */
2922 bool
2923 vect_analyze_data_ref_accesses (vec_info *vinfo)
2925 unsigned int i;
2926 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2927 struct data_reference *dr;
2929 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2931 if (datarefs.is_empty ())
2932 return true;
2934 /* Sort the array of datarefs to make building the interleaving chains
2935 linear. Don't modify the original vector's order, it is needed for
2936 determining what dependencies are reversed. */
2937 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2938 datarefs_copy.qsort (dr_group_sort_cmp);
2940 /* Build the interleaving chains. */
2941 for (i = 0; i < datarefs_copy.length () - 1;)
2943 data_reference_p dra = datarefs_copy[i];
2944 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2945 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2946 stmt_vec_info lastinfo = NULL;
2947 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2948 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2950 ++i;
2951 continue;
2953 for (i = i + 1; i < datarefs_copy.length (); ++i)
2955 data_reference_p drb = datarefs_copy[i];
2956 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2957 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2958 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2959 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2960 break;
2962          /* ??? Imperfect sorting (non-compatible types, non-modulo
2963             accesses, same accesses) can lead to a group being artificially
2964             split here as we don't just skip over those.  If it really
2965             matters we can push those to a worklist and re-iterate
2966             over them.  Then we can just skip ahead to the next DR here. */
2968 /* DRs in a different loop should not be put into the same
2969 interleaving group. */
2970 if (gimple_bb (DR_STMT (dra))->loop_father
2971 != gimple_bb (DR_STMT (drb))->loop_father)
2972 break;
2974 /* Check that the data-refs have same first location (except init)
2975 and they are both either store or load (not load and store,
2976 not masked loads or stores). */
2977 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2978 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2979 DR_BASE_ADDRESS (drb)) != 0
2980 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2981 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b))
2982 break;
2984 /* Check that the data-refs have the same constant size. */
2985 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2986 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2987 if (!tree_fits_uhwi_p (sza)
2988 || !tree_fits_uhwi_p (szb)
2989 || !tree_int_cst_equal (sza, szb))
2990 break;
2992 /* Check that the data-refs have the same step. */
2993 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2994 break;
2996 /* Check the types are compatible.
2997 ??? We don't distinguish this during sorting. */
2998 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2999 TREE_TYPE (DR_REF (drb))))
3000 break;
3002 /* Check that the DR_INITs are compile-time constants. */
3003 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3004 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3005 break;
3007 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3008 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3009 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3010 HOST_WIDE_INT init_prev
3011 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3012 gcc_assert (init_a <= init_b
3013 && init_a <= init_prev
3014 && init_prev <= init_b);
3016 /* Do not place the same access in the interleaving chain twice. */
3017 if (init_b == init_prev)
3019 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3020 < gimple_uid (DR_STMT (drb)));
3021 /* ??? For now we simply "drop" the later reference which is
3022 otherwise the same rather than finishing off this group.
3023 In the end we'd want to re-process duplicates forming
3024 multiple groups from the refs, likely by just collecting
3025 all candidates (including duplicates and split points
3026 below) in a vector and then process them together. */
3027 continue;
3030 /* If init_b == init_a + the size of the type * k, we have an
3031 interleaving, and DRA is accessed before DRB. */
3032 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3033 if (type_size_a == 0
3034 || (init_b - init_a) % type_size_a != 0)
3035 break;
3037 /* If we have a store, the accesses are adjacent. This splits
3038 groups into chunks we support (we don't support vectorization
3039 of stores with gaps). */
3040 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3041 break;
3043          /* If the step (when constant and nonzero) is no greater than the
3044             difference between the data-refs' inits, this splits groups into
3045             suitable sizes. */
3046 if (tree_fits_shwi_p (DR_STEP (dra)))
3048 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3049 if (step != 0 && step <= (init_b - init_a))
3050 break;
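          /* A sketch of what survives all of the checks above (hypothetical
             struct accesses): a[i].x with DR_INIT 0 and a[i].y with DR_INIT 4,
             both 4-byte reads with DR_STEP 8, pass every test (the init
             difference 4 is a multiple of the type size and smaller than the
             step), so the two statements are chained into one interleaving
             group below.  */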
3053 if (dump_enabled_p ())
3055 dump_printf_loc (MSG_NOTE, vect_location,
3056 "Detected interleaving ");
3057 if (DR_IS_READ (dra))
3058 dump_printf (MSG_NOTE, "load ");
3059 else
3060 dump_printf (MSG_NOTE, "store ");
3061 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3062 dump_printf (MSG_NOTE, " and ");
3063 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3064 dump_printf (MSG_NOTE, "\n");
3067 /* Link the found element into the group list. */
3068 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3070 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3071 lastinfo = stmtinfo_a;
3073 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3074 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3075 lastinfo = stmtinfo_b;
3079 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3081 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3082 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3083 && !vect_analyze_data_ref_access (dr_info))
3085 if (dump_enabled_p ())
3086 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3087 "not vectorized: complicated access pattern.\n");
3089 if (is_a <bb_vec_info> (vinfo))
3091 /* Mark the statement as not vectorizable. */
3092 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3093 continue;
3095 else
3097 datarefs_copy.release ();
3098 return false;
3103 datarefs_copy.release ();
3104 return true;
3107 /* Function vect_vfa_segment_size.
3109 Input:
3110 DR_INFO: The data reference.
3111 LENGTH_FACTOR: segment length to consider.
3113 Return a value suitable for the dr_with_seg_len::seg_len field.
3114 This is the "distance travelled" by the pointer from the first
3115 iteration in the segment to the last. Note that it does not include
3116 the size of the access; in effect it only describes the first byte. */
3118 static tree
3119 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3121 length_factor = size_binop (MINUS_EXPR,
3122 fold_convert (sizetype, length_factor),
3123 size_one_node);
3124 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3125 length_factor);
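  /* Worked example (hypothetical numbers): for a DR_STEP of 4 bytes and a
     LENGTH_FACTOR of 8 iterations, the segment length is 4 * (8 - 1) == 28
     bytes: the distance from the address used in the first iteration of the
     segment to the address used in the last, excluding the access size
     itself.  */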
3128 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3129 gives the worst-case number of bytes covered by the segment. */
3131 static unsigned HOST_WIDE_INT
3132 vect_vfa_access_size (dr_vec_info *dr_info)
3134 stmt_vec_info stmt_vinfo = dr_info->stmt;
3135 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3136 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3137 unsigned HOST_WIDE_INT access_size = ref_size;
3138 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3140 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3141 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3143 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3144 && (vect_supportable_dr_alignment (dr_info, false)
3145 == dr_explicit_realign_optimized))
3147 /* We might access a full vector's worth. */
3148 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3149 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3151 return access_size;
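  /* Worked example (hypothetical numbers): a 4-byte scalar reference that is
     the first element of a group with DR_GROUP_SIZE 4 and DR_GROUP_GAP 1
     yields 4 * (4 - 1) == 12 bytes; if it is also accessed with
     dr_explicit_realign_optimized and a 16-byte vector type, a further
     16 - 4 == 12 bytes are added for the worst case.  */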
3154 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3155 describes. */
3157 static unsigned int
3158 vect_vfa_align (dr_vec_info *dr_info)
3160 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3163 /* Function vect_no_alias_p.
3165 Given data references A and B with equal base and offset, see whether
3166 the alias relation can be decided at compilation time. Return 1 if
3167 it can and the references alias, 0 if it can and the references do
3168 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3169 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3170 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3172 static int
3173 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3174 tree segment_length_a, tree segment_length_b,
3175 unsigned HOST_WIDE_INT access_size_a,
3176 unsigned HOST_WIDE_INT access_size_b)
3178 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3179 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3180 poly_uint64 const_length_a;
3181 poly_uint64 const_length_b;
3183 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3184 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3185 [a, a+12) */
3186 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3188 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3189 offset_a = (offset_a + access_size_a) - const_length_a;
3191 else
3192 const_length_a = tree_to_poly_uint64 (segment_length_a);
3193 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3195 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3196 offset_b = (offset_b + access_size_b) - const_length_b;
3198 else
3199 const_length_b = tree_to_poly_uint64 (segment_length_b);
3201 const_length_a += access_size_a;
3202 const_length_b += access_size_b;
3204 if (ranges_known_overlap_p (offset_a, const_length_a,
3205 offset_b, const_length_b))
3206 return 1;
3208 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3209 offset_b, const_length_b))
3210 return 0;
3212 return -1;
3215 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3216 in DDR is >= VF. */
3218 static bool
3219 dependence_distance_ge_vf (data_dependence_relation *ddr,
3220 unsigned int loop_depth, poly_uint64 vf)
3222 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3223 || DDR_NUM_DIST_VECTS (ddr) == 0)
3224 return false;
3226 /* If the dependence is exact, we should have limited the VF instead. */
3227 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3229 unsigned int i;
3230 lambda_vector dist_v;
3231 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3233 HOST_WIDE_INT dist = dist_v[loop_depth];
3234 if (dist != 0
3235 && !(dist > 0 && DDR_REVERSED_P (ddr))
3236 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3237 return false;
3240 if (dump_enabled_p ())
3242 dump_printf_loc (MSG_NOTE, vect_location,
3243 "dependence distance between ");
3244 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3245 dump_printf (MSG_NOTE, " and ");
3246 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3247 dump_printf (MSG_NOTE, " is >= VF\n");
3250 return true;
3253 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3255 static void
3256 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3258 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3259 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3260 dump_printf (dump_kind, ") >= ");
3261 dump_dec (dump_kind, lower_bound.min_value);
3264 /* Record that the vectorized loop requires the vec_lower_bound described
3265 by EXPR, UNSIGNED_P and MIN_VALUE. */
3267 static void
3268 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3269 poly_uint64 min_value)
3271 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3272 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3273 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3275 unsigned_p &= lower_bounds[i].unsigned_p;
3276 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3277 if (lower_bounds[i].unsigned_p != unsigned_p
3278 || maybe_lt (lower_bounds[i].min_value, min_value))
3280 lower_bounds[i].unsigned_p = unsigned_p;
3281 lower_bounds[i].min_value = min_value;
3282 if (dump_enabled_p ())
3284 dump_printf_loc (MSG_NOTE, vect_location,
3285 "updating run-time check to ");
3286 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3287 dump_printf (MSG_NOTE, "\n");
3290 return;
3293 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3294 if (dump_enabled_p ())
3296 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3297 dump_lower_bound (MSG_NOTE, lower_bound);
3298 dump_printf (MSG_NOTE, "\n");
3300 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3303 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3304 will span fewer than GAP bytes. */
3306 static bool
3307 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3308 poly_int64 gap)
3310 stmt_vec_info stmt_info = dr_info->stmt;
3311 HOST_WIDE_INT count
3312 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3313 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3314 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3315 return (estimated_poly_value (gap)
3316 <= count * vect_get_scalar_dr_size (dr_info));
3319 /* Return true if we know that there is no alias between DR_INFO_A and
3320 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3321 When returning true, set *LOWER_BOUND_OUT to this N. */
3323 static bool
3324 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3325 poly_uint64 *lower_bound_out)
3327 /* Check that there is a constant gap of known sign between DR_A
3328 and DR_B. */
3329 data_reference *dr_a = dr_info_a->dr;
3330 data_reference *dr_b = dr_info_b->dr;
3331 poly_int64 init_a, init_b;
3332 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3333 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3334 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3335 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3336 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3337 || !ordered_p (init_a, init_b))
3338 return false;
3340 /* Sort DR_A and DR_B by the address they access. */
3341 if (maybe_lt (init_b, init_a))
3343 std::swap (init_a, init_b);
3344 std::swap (dr_info_a, dr_info_b);
3345 std::swap (dr_a, dr_b);
3348 /* If the two accesses could be dependent within a scalar iteration,
3349 make sure that we'd retain their order. */
3350 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3351 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3352 return false;
3354 /* There is no alias if abs (DR_STEP) is greater than or equal to
3355 the bytes spanned by the combination of the two accesses. */
3356 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3357 return true;
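  /* Worked example (hypothetical numbers): two 4-byte accesses to the same
     base with DR_INITs 0 and 12 give *LOWER_BOUND_OUT == 12 + 4 - 0 == 16,
     so the two references do not alias whenever abs (DR_STEP) is at least
     16 bytes.  */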
3360 /* Function vect_prune_runtime_alias_test_list.
3362 Prune a list of ddrs to be tested at run-time by versioning for alias.
3363 Merge several alias checks into one if possible.
3364    Return FALSE if the resulting list of ddrs is longer than allowed by
3365 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3367 bool
3368 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3370 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3371 hash_set <tree_pair_hash> compared_objects;
3373 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3374 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3375 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3376 vec<vec_object_pair> &check_unequal_addrs
3377 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3378 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3379 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3381 ddr_p ddr;
3382 unsigned int i;
3383 tree length_factor;
3385 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3387 /* Step values are irrelevant for aliasing if the number of vector
3388 iterations is equal to the number of scalar iterations (which can
3389 happen for fully-SLP loops). */
3390 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3392 if (!ignore_step_p)
3394 /* Convert the checks for nonzero steps into bound tests. */
3395 tree value;
3396 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3397 vect_check_lower_bound (loop_vinfo, value, true, 1);
3400 if (may_alias_ddrs.is_empty ())
3401 return true;
3403 comp_alias_ddrs.create (may_alias_ddrs.length ());
3405 unsigned int loop_depth
3406 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3407 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3409 /* First, we collect all data ref pairs for aliasing checks. */
3410 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3412 int comp_res;
3413 poly_uint64 lower_bound;
3414 tree segment_length_a, segment_length_b;
3415 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3416 unsigned int align_a, align_b;
3418 /* Ignore the alias if the VF we chose ended up being no greater
3419 than the dependence distance. */
3420 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3421 continue;
3423 if (DDR_OBJECT_A (ddr))
3425 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3426 if (!compared_objects.add (new_pair))
3428 if (dump_enabled_p ())
3430 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3431 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3432 dump_printf (MSG_NOTE, " and ");
3433 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3434 dump_printf (MSG_NOTE, " have different addresses\n");
3436 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3438 continue;
3441 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3442 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3444 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3445 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3447 /* Skip the pair if inter-iteration dependencies are irrelevant
3448 and intra-iteration dependencies are guaranteed to be honored. */
3449 if (ignore_step_p
3450 && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b)
3451 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3452 &lower_bound)))
3454 if (dump_enabled_p ())
3456 dump_printf_loc (MSG_NOTE, vect_location,
3457 "no need for alias check between ");
3458 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_a->dr));
3459 dump_printf (MSG_NOTE, " and ");
3460 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_b->dr));
3461 dump_printf (MSG_NOTE, " when VF is 1\n");
3463 continue;
3466 /* See whether we can handle the alias using a bounds check on
3467 the step, and whether that's likely to be the best approach.
3468 (It might not be, for example, if the minimum step is much larger
3469 than the number of bytes handled by one vector iteration.) */
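/* Worked example (sketch only): with 4-byte elements, two accesses *p and
   p[-1] within one scalar iteration together span LOWER_BOUND = 8 bytes,
   so a runtime step of 8 or more (or at most -8, when the stride may be
   backward) guarantees the iterations never overlap.  In that case
   vect_check_lower_bound records a simple bound test on the step instead
   of a full segment-overlap check.  */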
3470 if (!ignore_step_p
3471 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3472 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3473 &lower_bound)
3474 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3475 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3477 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3478 if (dump_enabled_p ())
3480 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3481 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_a->dr));
3482 dump_printf (MSG_NOTE, " and ");
3483 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_b->dr));
3484 dump_printf (MSG_NOTE, " when the step ");
3485 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_info_a->dr));
3486 dump_printf (MSG_NOTE, " is outside ");
3487 if (unsigned_p)
3488 dump_printf (MSG_NOTE, "[0");
3489 else
3491 dump_printf (MSG_NOTE, "(");
3492 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3494 dump_printf (MSG_NOTE, ", ");
3495 dump_dec (MSG_NOTE, lower_bound);
3496 dump_printf (MSG_NOTE, ")\n");
3498 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3499 unsigned_p, lower_bound);
3500 continue;
3503 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3504 if (dr_group_first_a)
3506 stmt_info_a = dr_group_first_a;
3507 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3510 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3511 if (dr_group_first_b)
3513 stmt_info_b = dr_group_first_b;
3514 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3517 if (ignore_step_p)
3519 segment_length_a = size_zero_node;
3520 segment_length_b = size_zero_node;
3522 else
3524 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3525 DR_STEP (dr_info_b->dr), 0))
3526 length_factor = scalar_loop_iters;
3527 else
3528 length_factor = size_int (vect_factor);
3529 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3530 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3532 access_size_a = vect_vfa_access_size (dr_info_a);
3533 access_size_b = vect_vfa_access_size (dr_info_b);
3534 align_a = vect_vfa_align (dr_info_a);
3535 align_b = vect_vfa_align (dr_info_b);
3537 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr),
3538 DR_BASE_ADDRESS (dr_info_b->dr));
3539 if (comp_res == 0)
3540 comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr),
3541 DR_OFFSET (dr_info_b->dr));
3543 /* See whether the alias is known at compilation time. */
3544 if (comp_res == 0
3545 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3546 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3547 && poly_int_tree_p (segment_length_a)
3548 && poly_int_tree_p (segment_length_b))
3550 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3551 segment_length_a,
3552 segment_length_b,
3553 access_size_a,
3554 access_size_b);
3555 if (res >= 0 && dump_enabled_p ())
3557 dump_printf_loc (MSG_NOTE, vect_location,
3558 "can tell at compile time that ");
3559 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_a->dr));
3560 dump_printf (MSG_NOTE, " and ");
3561 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_b->dr));
3562 if (res == 0)
3563 dump_printf (MSG_NOTE, " do not alias\n");
3564 else
3565 dump_printf (MSG_NOTE, " alias\n");
3568 if (res == 0)
3569 continue;
3571 if (res == 1)
3573 if (dump_enabled_p ())
3574 dump_printf_loc (MSG_NOTE, vect_location,
3575 "not vectorized: compilation time alias.\n");
3576 return false;
3580 dr_with_seg_len_pair_t dr_with_seg_len_pair
3581 (dr_with_seg_len (dr_info_a->dr, segment_length_a,
3582 access_size_a, align_a),
3583 dr_with_seg_len (dr_info_b->dr, segment_length_b,
3584 access_size_b, align_b));
3586 /* Canonicalize pairs by sorting the two DR members. */
3587 if (comp_res > 0)
3588 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3590 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3593 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3595 unsigned int count = (comp_alias_ddrs.length ()
3596 + check_unequal_addrs.length ());
3598 dump_printf_loc (MSG_NOTE, vect_location,
3599 "improved number of alias checks from %d to %d\n",
3600 may_alias_ddrs.length (), count);
3601 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3603 if (dump_enabled_p ())
3604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3605 "number of versioning for alias "
3606 "run-time tests exceeds %d "
3607 "(--param vect-max-version-for-alias-checks)\n",
3608 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3609 return false;
3612 return true;
3615 /* Check whether we can use an internal function for a gather load
3616 or scatter store. READ_P is true for loads and false for stores.
3617 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3618 the type of the memory elements being loaded or stored. OFFSET_BITS
3619 is the number of bits in each scalar offset and OFFSET_SIGN is the
3620 sign of the offset. SCALE is the amount by which the offset should
3621 be multiplied *after* it has been converted to address width.
3623 Return true if the function is supported, storing the function
3624 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
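/* For instance (illustration only), a gather such as

     for (int i = 0; i < n; i++)
       out[i] = base[idx[i]];

   with 32-bit elements can use the internal function when IDX is 32 bits
   or narrower (narrower offsets are extended), but not when IDX is
   64 bits wide, since truncating the offsets to the element width would
   not be safe.  */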
3626 bool
3627 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3628 tree memory_type, unsigned int offset_bits,
3629 signop offset_sign, int scale,
3630 internal_fn *ifn_out, tree *element_type_out)
3632 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3633 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3634 if (offset_bits > element_bits)
3635 /* Internal functions require the offset to be the same width as
3636 the vector elements. We can extend narrower offsets, but it isn't
3637 safe to truncate wider offsets. */
3638 return false;
3640 if (element_bits != memory_bits)
3641 /* For now the vector elements must be the same width as the
3642 memory elements. */
3643 return false;
3645 /* Work out which function we need. */
3646 internal_fn ifn;
3647 if (read_p)
3648 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3649 else
3650 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3652 /* Test whether the target supports this combination. */
3653 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3654 offset_sign, scale))
3655 return false;
3657 *ifn_out = ifn;
3658 *element_type_out = TREE_TYPE (vectype);
3659 return true;
3662 /* STMT_INFO is a call to an internal gather load or scatter store function.
3663 Describe the operation in INFO. */
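/* As read back below (a sketch of the call shape only): argument 0 of
   such an internal call is the base address, argument 1 the vector of
   offsets and argument 2 the scale, e.g.

     vect__1 = .GATHER_LOAD (base_2, vect_off_3, 4);  */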
3665 static void
3666 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3667 gather_scatter_info *info)
3669 gcall *call = as_a <gcall *> (stmt_info->stmt);
3670 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3671 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3673 info->ifn = gimple_call_internal_fn (call);
3674 info->decl = NULL_TREE;
3675 info->base = gimple_call_arg (call, 0);
3676 info->offset = gimple_call_arg (call, 1);
3677 info->offset_dt = vect_unknown_def_type;
3678 info->offset_vectype = NULL_TREE;
3679 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3680 info->element_type = TREE_TYPE (vectype);
3681 info->memory_type = TREE_TYPE (DR_REF (dr));
3684 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3685 gather load or scatter store. Describe the operation in *INFO if so. */
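/* A typical candidate (illustration) is a non-affine access such as

     for (i = 0; i < n; i++)
       sum += data[ind[i]];

   where the loop-invariant DATA becomes the base and the vectorized
   values of IND[i], scaled by the element size, become the offsets.  */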
3687 bool
3688 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3689 gather_scatter_info *info)
3691 HOST_WIDE_INT scale = 1;
3692 poly_int64 pbitpos, pbitsize;
3693 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3694 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3695 tree offtype = NULL_TREE;
3696 tree decl = NULL_TREE, base, off;
3697 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3698 tree memory_type = TREE_TYPE (DR_REF (dr));
3699 machine_mode pmode;
3700 int punsignedp, reversep, pvolatilep = 0;
3701 internal_fn ifn;
3702 tree element_type;
3703 bool masked_p = false;
3705 /* See whether this is already a call to a gather/scatter internal function.
3706 If not, see whether it's a masked load or store. */
3707 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3708 if (call && gimple_call_internal_p (call))
3710 ifn = gimple_call_internal_fn (call);
3711 if (internal_gather_scatter_fn_p (ifn))
3713 vect_describe_gather_scatter_call (stmt_info, info);
3714 return true;
3716 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3719 /* True if we should aim to use internal functions rather than
3720 built-in functions. */
3721 bool use_ifn_p = (DR_IS_READ (dr)
3722 ? supports_vec_gather_load_p ()
3723 : supports_vec_scatter_store_p ());
3725 base = DR_REF (dr);
3726 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3727 see if we can use the def stmt of the address. */
3728 if (masked_p
3729 && TREE_CODE (base) == MEM_REF
3730 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3731 && integer_zerop (TREE_OPERAND (base, 1))
3732 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3734 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3735 if (is_gimple_assign (def_stmt)
3736 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3737 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3740 /* The gather and scatter builtins need address of the form
3741 loop_invariant + vector * {1, 2, 4, 8}
3742 or
3743 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3744 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3745 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3746 multiplications and additions in it. To get a vector, we need
3747 a single SSA_NAME that will be defined in the loop and will
3748 contain everything that is not loop invariant and that can be
3749 vectorized. The following code attempts to find such a preexisting
3750 SSA_NAME OFF and put the loop invariants into a tree BASE
3751 that can be gimplified before the loop. */
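/* For instance (sketch), given

     for (i = 0; i < n; i++)
       x[i] = p[ind[i] + 4];

   we would like OFF to end up as the SSA_NAME holding ind[i], with the
   loop-invariant part, p plus 4 scaled by the element size, folded into
   BASE.  */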
3752 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3753 &punsignedp, &reversep, &pvolatilep);
3754 if (reversep)
3755 return false;
3757 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3759 if (TREE_CODE (base) == MEM_REF)
3761 if (!integer_zerop (TREE_OPERAND (base, 1)))
3763 if (off == NULL_TREE)
3764 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3765 else
3766 off = size_binop (PLUS_EXPR, off,
3767 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3769 base = TREE_OPERAND (base, 0);
3771 else
3772 base = build_fold_addr_expr (base);
3774 if (off == NULL_TREE)
3775 off = size_zero_node;
3777 /* If BASE is not loop invariant, then if OFF is 0 we start with just
3778 the constant offset in the loop-invariant BASE and continue with BASE
3779 as OFF, otherwise we give up.
3780 We could handle that case by gimplifying the addition of base + off
3781 into some SSA_NAME and use that as off, but for now punt. */
3782 if (!expr_invariant_in_loop_p (loop, base))
3784 if (!integer_zerop (off))
3785 return false;
3786 off = base;
3787 base = size_int (pbytepos);
3789 /* Otherwise put base + constant offset into the loop invariant BASE
3790 and continue with OFF. */
3791 else
3793 base = fold_convert (sizetype, base);
3794 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3797 /* OFF at this point may be either a SSA_NAME or some tree expression
3798 from get_inner_reference. Try to peel off loop invariants from it
3799 into BASE as long as possible. */
3800 STRIP_NOPS (off);
3801 while (offtype == NULL_TREE)
3803 enum tree_code code;
3804 tree op0, op1, add = NULL_TREE;
3806 if (TREE_CODE (off) == SSA_NAME)
3808 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3810 if (expr_invariant_in_loop_p (loop, off))
3811 return false;
3813 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3814 break;
3816 op0 = gimple_assign_rhs1 (def_stmt);
3817 code = gimple_assign_rhs_code (def_stmt);
3818 op1 = gimple_assign_rhs2 (def_stmt);
3820 else
3822 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3823 return false;
3824 code = TREE_CODE (off);
3825 extract_ops_from_tree (off, &code, &op0, &op1);
3827 switch (code)
3829 case POINTER_PLUS_EXPR:
3830 case PLUS_EXPR:
3831 if (expr_invariant_in_loop_p (loop, op0))
3833 add = op0;
3834 off = op1;
3835 do_add:
3836 add = fold_convert (sizetype, add);
3837 if (scale != 1)
3838 add = size_binop (MULT_EXPR, add, size_int (scale));
3839 base = size_binop (PLUS_EXPR, base, add);
3840 continue;
3842 if (expr_invariant_in_loop_p (loop, op1))
3844 add = op1;
3845 off = op0;
3846 goto do_add;
3848 break;
3849 case MINUS_EXPR:
3850 if (expr_invariant_in_loop_p (loop, op1))
3852 add = fold_convert (sizetype, op1);
3853 add = size_binop (MINUS_EXPR, size_zero_node, add);
3854 off = op0;
3855 goto do_add;
3857 break;
3858 case MULT_EXPR:
3859 if (scale == 1 && tree_fits_shwi_p (op1))
3861 int new_scale = tree_to_shwi (op1);
3862 /* Only treat this as a scaling operation if the target
3863 supports it. */
3864 if (use_ifn_p
3865 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3866 vectype, memory_type, 1,
3867 TYPE_SIGN (TREE_TYPE (op0)),
3868 new_scale, &ifn,
3869 &element_type))
3870 break;
3871 scale = new_scale;
3872 off = op0;
3873 continue;
3875 break;
3876 case SSA_NAME:
3877 off = op0;
3878 continue;
3879 CASE_CONVERT:
3880 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3881 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3882 break;
3883 if (TYPE_PRECISION (TREE_TYPE (op0))
3884 == TYPE_PRECISION (TREE_TYPE (off)))
3886 off = op0;
3887 continue;
3890 /* The internal functions need the offset to be the same width
3891 as the elements of VECTYPE. Don't include operations that
3892 cast the offset from that width to a different width. */
3893 if (use_ifn_p
3894 && (int_size_in_bytes (TREE_TYPE (vectype))
3895 == int_size_in_bytes (TREE_TYPE (off))))
3896 break;
3898 if (TYPE_PRECISION (TREE_TYPE (op0))
3899 < TYPE_PRECISION (TREE_TYPE (off)))
3901 off = op0;
3902 offtype = TREE_TYPE (off);
3903 STRIP_NOPS (off);
3904 continue;
3906 break;
3907 default:
3908 break;
3910 break;
3913 /* If at the end OFF still isn't a SSA_NAME or isn't
3914 defined in the loop, punt. */
3915 if (TREE_CODE (off) != SSA_NAME
3916 || expr_invariant_in_loop_p (loop, off))
3917 return false;
3919 if (offtype == NULL_TREE)
3920 offtype = TREE_TYPE (off);
3922 if (use_ifn_p)
3924 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3925 memory_type, TYPE_PRECISION (offtype),
3926 TYPE_SIGN (offtype), scale, &ifn,
3927 &element_type))
3928 return false;
3930 else
3932 if (DR_IS_READ (dr))
3934 if (targetm.vectorize.builtin_gather)
3935 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3937 else
3939 if (targetm.vectorize.builtin_scatter)
3940 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3943 if (!decl)
3944 return false;
3946 ifn = IFN_LAST;
3947 element_type = TREE_TYPE (vectype);
3950 info->ifn = ifn;
3951 info->decl = decl;
3952 info->base = base;
3953 info->offset = off;
3954 info->offset_dt = vect_unknown_def_type;
3955 info->offset_vectype = NULL_TREE;
3956 info->scale = scale;
3957 info->element_type = element_type;
3958 info->memory_type = memory_type;
3959 return true;
3962 /* Find the data references in STMT, analyze them with respect to LOOP and
3963 append them to DATAREFS. Return false if datarefs in this stmt cannot
3964 be handled. */
3966 bool
3967 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3968 vec<data_reference_p> *datarefs)
3970 /* We can ignore clobbers for dataref analysis - they are removed during
3971 loop vectorization and BB vectorization checks dependences with a
3972 stmt walk. */
3973 if (gimple_clobber_p (stmt))
3974 return true;
3976 if (gimple_has_volatile_ops (stmt))
3978 if (dump_enabled_p ())
3980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3981 "not vectorized: volatile type ");
3982 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3984 return false;
3987 if (stmt_can_throw_internal (stmt))
3989 if (dump_enabled_p ())
3991 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3992 "not vectorized: statement can throw an "
3993 "exception ");
3994 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3996 return false;
3999 auto_vec<data_reference_p, 2> refs;
4000 if (!find_data_references_in_stmt (loop, stmt, &refs))
4001 return false;
4003 if (refs.is_empty ())
4004 return true;
4006 if (refs.length () > 1)
4008 if (dump_enabled_p ())
4010 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4011 "not vectorized: more than one data ref "
4012 "in stmt: ");
4013 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4015 return false;
4018 if (gcall *call = dyn_cast <gcall *> (stmt))
4019 if (!gimple_call_internal_p (call)
4020 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4021 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4023 if (dump_enabled_p ())
4025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4026 "not vectorized: dr in a call ");
4027 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4029 return false;
4032 data_reference_p dr = refs.pop ();
4033 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4034 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4036 if (dump_enabled_p ())
4038 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4039 "not vectorized: statement is bitfield "
4040 "access ");
4041 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4043 return false;
4046 if (DR_BASE_ADDRESS (dr)
4047 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4049 if (dump_enabled_p ())
4050 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4051 "not vectorized: base addr of dr is a "
4052 "constant\n");
4053 return false;
4056 /* Check whether this may be a SIMD lane access and adjust the
4057 DR to make it easier for us to handle it. */
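/* Such accesses typically come from OpenMP SIMD lowering, e.g. (sketch,
   with hypothetical names)

     #pragma omp simd
     for (i = 0; i < n; i++)
       { ... magic_array[__simd_lane] ... }

   where the DR's offset is __simd_lane * sizeof (element) and
   __simd_lane is defined by an IFN_GOMP_SIMD_LANE call.  The code below
   rewrites such a DR to a zero offset and an element-sized step, and
   marks the rewritten DR in its aux field as a simd-lane access.  */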
4058 if (loop
4059 && loop->simduid
4060 && (!DR_BASE_ADDRESS (dr)
4061 || !DR_OFFSET (dr)
4062 || !DR_INIT (dr)
4063 || !DR_STEP (dr)))
4065 struct data_reference *newdr
4066 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4067 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4068 if (DR_BASE_ADDRESS (newdr)
4069 && DR_OFFSET (newdr)
4070 && DR_INIT (newdr)
4071 && DR_STEP (newdr)
4072 && integer_zerop (DR_STEP (newdr)))
4074 tree off = DR_OFFSET (newdr);
4075 STRIP_NOPS (off);
4076 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4077 && TREE_CODE (off) == MULT_EXPR
4078 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4080 tree step = TREE_OPERAND (off, 1);
4081 off = TREE_OPERAND (off, 0);
4082 STRIP_NOPS (off);
4083 if (CONVERT_EXPR_P (off)
4084 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4085 < TYPE_PRECISION (TREE_TYPE (off))))
4086 off = TREE_OPERAND (off, 0);
4087 if (TREE_CODE (off) == SSA_NAME)
4089 gimple *def = SSA_NAME_DEF_STMT (off);
4090 tree reft = TREE_TYPE (DR_REF (newdr));
4091 if (is_gimple_call (def)
4092 && gimple_call_internal_p (def)
4093 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4095 tree arg = gimple_call_arg (def, 0);
4096 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4097 arg = SSA_NAME_VAR (arg);
4098 if (arg == loop->simduid
4099 /* For now. */
4100 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4102 DR_OFFSET (newdr) = ssize_int (0);
4103 DR_STEP (newdr) = step;
4104 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4105 DR_STEP_ALIGNMENT (newdr)
4106 = highest_pow2_factor (step);
4107 /* Mark as simd-lane access. */
4108 newdr->aux = (void *)-1;
4109 free_data_ref (dr);
4110 datarefs->safe_push (newdr);
4111 return true;
4117 free_data_ref (newdr);
4120 datarefs->safe_push (dr);
4121 return true;
4124 /* Function vect_analyze_data_refs.
4126 Find all the data references in the loop or basic block.
4128 The general structure of the analysis of data refs in the vectorizer is as
4129 follows:
4130 1- vect_analyze_data_refs(loop/bb): call
4131 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4132 in the loop/bb and their dependences.
4133 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4134 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4135 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4139 bool
4140 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4142 struct loop *loop = NULL;
4143 unsigned int i;
4144 struct data_reference *dr;
4145 tree scalar_type;
4147 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4149 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4150 loop = LOOP_VINFO_LOOP (loop_vinfo);
4152 /* Go through the data-refs, check that the analysis succeeded. Update
4153 pointer from stmt_vec_info struct to DR and vectype. */
4155 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4156 FOR_EACH_VEC_ELT (datarefs, i, dr)
4158 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4159 poly_uint64 vf;
4161 gcc_assert (DR_REF (dr));
4162 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4163 gcc_assert (!stmt_info->dr_aux.dr);
4164 stmt_info->dr_aux.dr = dr;
4165 stmt_info->dr_aux.stmt = stmt_info;
4167 /* Check that analysis of the data-ref succeeded. */
4168 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4169 || !DR_STEP (dr))
4171 bool maybe_gather
4172 = DR_IS_READ (dr)
4173 && !TREE_THIS_VOLATILE (DR_REF (dr))
4174 && (targetm.vectorize.builtin_gather != NULL
4175 || supports_vec_gather_load_p ());
4176 bool maybe_scatter
4177 = DR_IS_WRITE (dr)
4178 && !TREE_THIS_VOLATILE (DR_REF (dr))
4179 && (targetm.vectorize.builtin_scatter != NULL
4180 || supports_vec_scatter_store_p ());
4182 /* If the target supports vector gather loads or scatter stores,
4183 see if they can be used. */
4184 if (is_a <loop_vec_info> (vinfo)
4185 && !nested_in_vect_loop_p (loop, stmt_info))
4187 if (maybe_gather || maybe_scatter)
4189 if (maybe_gather)
4190 gatherscatter = GATHER;
4191 else
4192 gatherscatter = SCATTER;
4196 if (gatherscatter == SG_NONE)
4198 if (dump_enabled_p ())
4200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4201 "not vectorized: data ref analysis "
4202 "failed ");
4203 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4204 stmt_info->stmt, 0);
4206 if (is_a <bb_vec_info> (vinfo))
4208 /* In BB vectorization the ref can still participate
4209 in dependence analysis, we just can't vectorize it. */
4210 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4211 continue;
4213 return false;
4217 /* See if this was detected as SIMD lane access. */
4218 if (dr->aux == (void *)-1)
4220 if (nested_in_vect_loop_p (loop, stmt_info))
4222 if (dump_enabled_p ())
4224 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4225 "not vectorized: data ref analysis "
4226 "failed ");
4227 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4228 stmt_info->stmt, 0);
4230 return false;
4232 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4235 tree base = get_base_address (DR_REF (dr));
4236 if (base && VAR_P (base) && DECL_NONALIASED (base))
4238 if (dump_enabled_p ())
4240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4241 "not vectorized: base object not addressable "
4242 "for stmt: ");
4243 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4244 stmt_info->stmt, 0);
4246 if (is_a <bb_vec_info> (vinfo))
4248 /* In BB vectorization the ref can still participate
4249 in dependence analysis, we just can't vectorize it. */
4250 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4251 continue;
4253 return false;
4256 if (is_a <loop_vec_info> (vinfo)
4257 && DR_STEP (dr)
4258 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4260 if (nested_in_vect_loop_p (loop, stmt_info))
4262 if (dump_enabled_p ())
4264 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4265 "not vectorized: not suitable for strided "
4266 "load ");
4267 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4268 stmt_info->stmt, 0);
4270 return false;
4272 STMT_VINFO_STRIDED_P (stmt_info) = true;
4275 /* Update DR field in stmt_vec_info struct. */
4277 /* If the dataref is in an inner-loop of the loop that is considered
4278 for vectorization, we also want to analyze the access relative to
4279 the outer-loop (DR contains information only relative to the
4280 inner-most enclosing loop). We do that by building a reference to the
4281 first location accessed by the inner-loop, and analyze it relative to
4282 the outer-loop. */
4283 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4285 /* Build a reference to the first location accessed by the
4286 inner loop: *(BASE + INIT + OFFSET). By construction,
4287 this address must be invariant in the inner loop, so we
4288 can consider it as being used in the outer loop. */
4289 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4290 tree offset = unshare_expr (DR_OFFSET (dr));
4291 tree init = unshare_expr (DR_INIT (dr));
4292 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4293 init, offset);
4294 tree init_addr = fold_build_pointer_plus (base, init_offset);
4295 tree init_ref = build_fold_indirect_ref (init_addr);
4297 if (dump_enabled_p ())
4299 dump_printf_loc (MSG_NOTE, vect_location,
4300 "analyze in outer loop: ");
4301 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4302 dump_printf (MSG_NOTE, "\n");
4305 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4306 init_ref, loop))
4307 /* dr_analyze_innermost already explained the failure. */
4308 return false;
4310 if (dump_enabled_p ())
4312 dump_printf_loc (MSG_NOTE, vect_location,
4313 "\touter base_address: ");
4314 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4315 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4316 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4317 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4318 STMT_VINFO_DR_OFFSET (stmt_info));
4319 dump_printf (MSG_NOTE,
4320 "\n\touter constant offset from base address: ");
4321 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4322 STMT_VINFO_DR_INIT (stmt_info));
4323 dump_printf (MSG_NOTE, "\n\touter step: ");
4324 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4325 STMT_VINFO_DR_STEP (stmt_info));
4326 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4327 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4328 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4329 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4330 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4331 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4332 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4333 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4337 /* Set vectype for STMT. */
4338 scalar_type = TREE_TYPE (DR_REF (dr));
4339 STMT_VINFO_VECTYPE (stmt_info)
4340 = get_vectype_for_scalar_type (scalar_type);
4341 if (!STMT_VINFO_VECTYPE (stmt_info))
4343 if (dump_enabled_p ())
4345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4346 "not vectorized: no vectype for stmt: ");
4347 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4348 stmt_info->stmt, 0);
4349 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4350 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4351 scalar_type);
4352 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4355 if (is_a <bb_vec_info> (vinfo))
4357 /* Not having a vector type is fine; the ref can still participate
4358 in dependence analysis, we just can't vectorize it. */
4359 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4360 continue;
4362 return false;
4364 else
4366 if (dump_enabled_p ())
4368 dump_printf_loc (MSG_NOTE, vect_location,
4369 "got vectype for stmt: ");
4370 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
4371 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4372 STMT_VINFO_VECTYPE (stmt_info));
4373 dump_printf (MSG_NOTE, "\n");
4377 /* Adjust the minimal vectorization factor according to the
4378 vector type. */
4379 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4380 *min_vf = upper_bound (*min_vf, vf);
4382 if (gatherscatter != SG_NONE)
4384 gather_scatter_info gs_info;
4385 if (!vect_check_gather_scatter (stmt_info,
4386 as_a <loop_vec_info> (vinfo),
4387 &gs_info)
4388 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4390 if (dump_enabled_p ())
4392 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4393 (gatherscatter == GATHER) ?
4394 "not vectorized: not suitable for gather "
4395 "load " :
4396 "not vectorized: not suitable for scatter "
4397 "store ");
4398 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4399 stmt_info->stmt, 0);
4401 return false;
4403 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4407 /* We used to stop processing and prune the list here. Verify we no
4408 longer need to. */
4409 gcc_assert (i == datarefs.length ());
4411 return true;
4415 /* Function vect_get_new_vect_var.
4417 Returns a name for a new variable. The current naming scheme prepends a
4418 prefix that depends on VAR_KIND ("vect", "stmp", "mask" or "vectp",
4419 see the switch below) to NAME if NAME is provided, and otherwise uses
4420 the prefix alone. */
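/* For example (sketch): a vect_simple_var with NAME "foo" becomes a
   temporary named "vect_foo", a vect_pointer_var becomes "vectp_foo",
   and with no NAME the bare prefix is used.  */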
4422 tree
4423 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4425 const char *prefix;
4426 tree new_vect_var;
4428 switch (var_kind)
4430 case vect_simple_var:
4431 prefix = "vect";
4432 break;
4433 case vect_scalar_var:
4434 prefix = "stmp";
4435 break;
4436 case vect_mask_var:
4437 prefix = "mask";
4438 break;
4439 case vect_pointer_var:
4440 prefix = "vectp";
4441 break;
4442 default:
4443 gcc_unreachable ();
4446 if (name)
4448 char* tmp = concat (prefix, "_", name, NULL);
4449 new_vect_var = create_tmp_reg (type, tmp);
4450 free (tmp);
4452 else
4453 new_vect_var = create_tmp_reg (type, prefix);
4455 return new_vect_var;
4458 /* Like vect_get_new_vect_var but return an SSA name. */
4460 tree
4461 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4463 const char *prefix;
4464 tree new_vect_var;
4466 switch (var_kind)
4468 case vect_simple_var:
4469 prefix = "vect";
4470 break;
4471 case vect_scalar_var:
4472 prefix = "stmp";
4473 break;
4474 case vect_pointer_var:
4475 prefix = "vectp";
4476 break;
4477 default:
4478 gcc_unreachable ();
4481 if (name)
4483 char* tmp = concat (prefix, "_", name, NULL);
4484 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4485 free (tmp);
4487 else
4488 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4490 return new_vect_var;
4493 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4495 static void
4496 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4498 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4499 int misalign = DR_MISALIGNMENT (dr_info);
4500 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4501 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4502 else
4503 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4504 DR_TARGET_ALIGNMENT (dr_info), misalign);
4507 /* Function vect_create_addr_base_for_vector_ref.
4509 Create an expression that computes the address of the first memory location
4510 that will be accessed for a data reference.
4512 Input:
4513 STMT_INFO: The statement containing the data reference.
4514 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4515 OFFSET: Optional. If supplied, it is added to the initial address.
4516 LOOP: Specify relative to which loop-nest the address should be computed.
4517 For example, when the dataref is in an inner-loop nested in an
4518 outer-loop that is now being vectorized, LOOP can be either the
4519 outer-loop, or the inner-loop. The first memory location accessed
4520 by the following dataref ('in' points to short):
4522 for (i=0; i<N; i++)
4523 for (j=0; j<M; j++)
4524 s += in[i+j]
4526 is as follows:
4527 if LOOP=i_loop: &in (relative to i_loop)
4528 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4529 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4530 initial address. Unlike OFFSET, which is a number of elements to
4531 be added, BYTE_OFFSET is measured in bytes.
4533 Output:
4534 1. Return an SSA_NAME whose value is the address of the memory location of
4535 the first vector of the data reference.
4536 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4537 these statement(s) which define the returned SSA_NAME.
4539 FORNOW: We are only handling array accesses with step 1. */
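/* As a sketch of the value built below, the returned address is
   conceptually

     addr = BASE_ADDRESS + (OFFSET_FROM_BASE + INIT)
	    + OFFSET * sizeof (element) + BYTE_OFFSET;

   where BASE_ADDRESS, OFFSET_FROM_BASE and INIT come from the dataref's
   innermost behavior (all byte quantities), emitted as gimplified
   statements appended to NEW_STMT_LIST.  */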
4541 tree
4542 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4543 gimple_seq *new_stmt_list,
4544 tree offset,
4545 tree byte_offset)
4547 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4548 struct data_reference *dr = dr_info->dr;
4549 const char *base_name;
4550 tree addr_base;
4551 tree dest;
4552 gimple_seq seq = NULL;
4553 tree vect_ptr_type;
4554 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4555 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4556 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4558 tree data_ref_base = unshare_expr (drb->base_address);
4559 tree base_offset = unshare_expr (drb->offset);
4560 tree init = unshare_expr (drb->init);
4562 if (loop_vinfo)
4563 base_name = get_name (data_ref_base);
4564 else
4566 base_offset = ssize_int (0);
4567 init = ssize_int (0);
4568 base_name = get_name (DR_REF (dr));
4571 /* Create base_offset */
4572 base_offset = size_binop (PLUS_EXPR,
4573 fold_convert (sizetype, base_offset),
4574 fold_convert (sizetype, init));
4576 if (offset)
4578 offset = fold_build2 (MULT_EXPR, sizetype,
4579 fold_convert (sizetype, offset), step);
4580 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4581 base_offset, offset);
4583 if (byte_offset)
4585 byte_offset = fold_convert (sizetype, byte_offset);
4586 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4587 base_offset, byte_offset);
4590 /* base + base_offset */
4591 if (loop_vinfo)
4592 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4593 else
4595 addr_base = build1 (ADDR_EXPR,
4596 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4597 unshare_expr (DR_REF (dr)));
4600 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4601 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4602 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4603 gimple_seq_add_seq (new_stmt_list, seq);
4605 if (DR_PTR_INFO (dr)
4606 && TREE_CODE (addr_base) == SSA_NAME
4607 && !SSA_NAME_PTR_INFO (addr_base))
4609 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4610 if (offset || byte_offset)
4611 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4614 if (dump_enabled_p ())
4616 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4617 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4618 dump_printf (MSG_NOTE, "\n");
4621 return addr_base;
4625 /* Function vect_create_data_ref_ptr.
4627 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4628 location accessed in the loop by STMT_INFO, along with the def-use update
4629 chain to appropriately advance the pointer through the loop iterations.
4630 Also set aliasing information for the pointer. This pointer is used by
4631 the callers to this function to create a memory reference expression for
4632 vector load/store access.
4634 Input:
4635 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4636 GIMPLE_ASSIGN <name, data-ref> or
4637 GIMPLE_ASSIGN <data-ref, name>.
4638 2. AGGR_TYPE: the type of the reference, which should be either a vector
4639 or an array.
4640 3. AT_LOOP: the loop where the vector memref is to be created.
4641 4. OFFSET (optional): an offset to be added to the initial address accessed
4642 by the data-ref in STMT_INFO.
4643 5. BSI: location where the new stmts are to be placed if there is no loop
4644 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4645 pointing to the initial address.
4646 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4647 to the initial address accessed by the data-ref in STMT_INFO. This is
4648 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4649 in bytes.
4650 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4651 to the IV during each iteration of the loop. NULL says to move
4652 by one copy of AGGR_TYPE up or down, depending on the step of the
4653 data reference.
4655 Output:
4656 1. Declare a new ptr to vector_type, and have it point to the base of the
4657 data reference (the initial address accessed by the data reference).
4658 For example, for vector of type V8HI, the following code is generated:
4660 v8hi *ap;
4661 ap = (v8hi *)initial_address;
4663 if OFFSET is not supplied:
4664 initial_address = &a[init];
4665 if OFFSET is supplied:
4666 initial_address = &a[init + OFFSET];
4667 if BYTE_OFFSET is supplied:
4668 initial_address = &a[init] + BYTE_OFFSET;
4670 Return the initial_address in INITIAL_ADDRESS.
4672 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4673 update the pointer in each iteration of the loop.
4675 Return the increment stmt that updates the pointer in PTR_INCR.
4677 3. Return the pointer. */
4679 tree
4680 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4681 struct loop *at_loop, tree offset,
4682 tree *initial_address, gimple_stmt_iterator *gsi,
4683 gimple **ptr_incr, bool only_init,
4684 tree byte_offset, tree iv_step)
4686 const char *base_name;
4687 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4688 struct loop *loop = NULL;
4689 bool nested_in_vect_loop = false;
4690 struct loop *containing_loop = NULL;
4691 tree aggr_ptr_type;
4692 tree aggr_ptr;
4693 tree new_temp;
4694 gimple_seq new_stmt_list = NULL;
4695 edge pe = NULL;
4696 basic_block new_bb;
4697 tree aggr_ptr_init;
4698 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4699 struct data_reference *dr = dr_info->dr;
4700 tree aptr;
4701 gimple_stmt_iterator incr_gsi;
4702 bool insert_after;
4703 tree indx_before_incr, indx_after_incr;
4704 gimple *incr;
4705 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4707 gcc_assert (iv_step != NULL_TREE
4708 || TREE_CODE (aggr_type) == ARRAY_TYPE
4709 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4711 if (loop_vinfo)
4713 loop = LOOP_VINFO_LOOP (loop_vinfo);
4714 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4715 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4716 pe = loop_preheader_edge (loop);
4718 else
4720 gcc_assert (bb_vinfo);
4721 only_init = true;
4722 *ptr_incr = NULL;
4725 /* Create an expression for the first address accessed by this load
4726 in LOOP. */
4727 base_name = get_name (DR_BASE_ADDRESS (dr));
4729 if (dump_enabled_p ())
4731 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4732 dump_printf_loc (MSG_NOTE, vect_location,
4733 "create %s-pointer variable to type: ",
4734 get_tree_code_name (TREE_CODE (aggr_type)));
4735 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4736 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4737 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4738 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4739 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4740 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4741 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4742 else
4743 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4744 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4745 dump_printf (MSG_NOTE, "\n");
4748 /* (1) Create the new aggregate-pointer variable.
4749 Vector and array types inherit the alias set of their component
4750 type by default so we need to use a ref-all pointer if the data
4751 reference does not conflict with the created aggregated data
4752 reference because it is not addressable. */
4753 bool need_ref_all = false;
4754 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4755 get_alias_set (DR_REF (dr))))
4756 need_ref_all = true;
4757 /* Likewise for any of the data references in the stmt group. */
4758 else if (DR_GROUP_SIZE (stmt_info) > 1)
4760 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4761 do
4763 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4764 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4765 get_alias_set (DR_REF (sdr))))
4767 need_ref_all = true;
4768 break;
4770 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4772 while (sinfo);
4774 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4775 need_ref_all);
4776 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4779 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4780 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4781 def-use update cycles for the pointer: one relative to the outer-loop
4782 (LOOP), which is what steps (2) and (3) below do. The other is relative
4783 to the inner-loop (which is the inner-most loop containing the dataref),
4784 and this is done by step (4) below.
4786 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4787 inner-most loop, and so steps (2),(3) work the same, and step (4) is
4788 redundant. Steps (2),(3) create the following:
4790 vp0 = &base_addr;
4791 LOOP: vp1 = phi(vp0,vp2)
4794 vp2 = vp1 + step
4795 goto LOOP
4797 If there is an inner-loop nested in loop, then step (4) will also be
4798 applied, and an additional update in the inner-loop will be created:
4800 vp0 = &base_addr;
4801 LOOP: vp1 = phi(vp0,vp2)
4803 inner: vp3 = phi(vp1,vp4)
4804 vp4 = vp3 + inner_step
4805 if () goto inner
4807 vp2 = vp1 + step
4808 if () goto LOOP */
4810 /* (2) Calculate the initial address of the aggregate-pointer, and set
4811 the aggregate-pointer to point to it before the loop. */
4813 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4815 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4816 offset, byte_offset);
4817 if (new_stmt_list)
4819 if (pe)
4821 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4822 gcc_assert (!new_bb);
4824 else
4825 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4828 *initial_address = new_temp;
4829 aggr_ptr_init = new_temp;
4831 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4832 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4833 inner-loop nested in LOOP (during outer-loop vectorization). */
4835 /* No update in loop is required. */
4836 if (only_init && (!loop_vinfo || at_loop == loop))
4837 aptr = aggr_ptr_init;
4838 else
4840 /* Accesses to invariant addresses should be handled specially
4841 by the caller. */
4842 tree step = vect_dr_behavior (dr_info)->step;
4843 gcc_assert (!integer_zerop (step));
4845 if (iv_step == NULL_TREE)
4847 /* The step of the aggregate pointer is the type size,
4848 negated for downward accesses. */
4849 iv_step = TYPE_SIZE_UNIT (aggr_type);
4850 if (tree_int_cst_sgn (step) == -1)
4851 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4854 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4856 create_iv (aggr_ptr_init,
4857 fold_convert (aggr_ptr_type, iv_step),
4858 aggr_ptr, loop, &incr_gsi, insert_after,
4859 &indx_before_incr, &indx_after_incr);
4860 incr = gsi_stmt (incr_gsi);
4861 loop_vinfo->add_stmt (incr);
4863 /* Copy the points-to information if it exists. */
4864 if (DR_PTR_INFO (dr))
4866 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4867 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4869 if (ptr_incr)
4870 *ptr_incr = incr;
4872 aptr = indx_before_incr;
4875 if (!nested_in_vect_loop || only_init)
4876 return aptr;
4879 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4880 nested in LOOP, if exists. */
4882 gcc_assert (nested_in_vect_loop);
4883 if (!only_init)
4885 standard_iv_increment_position (containing_loop, &incr_gsi,
4886 &insert_after);
4887 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4888 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4889 &indx_after_incr);
4890 incr = gsi_stmt (incr_gsi);
4891 loop_vinfo->add_stmt (incr);
4893 /* Copy the points-to information if it exists. */
4894 if (DR_PTR_INFO (dr))
4896 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4897 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4899 if (ptr_incr)
4900 *ptr_incr = incr;
4902 return indx_before_incr;
4904 else
4905 gcc_unreachable ();
4909 /* Function bump_vector_ptr
4911 Increment a pointer (to a vector type) by vector-size. If requested,
4912 i.e. if PTR-INCR is given, then also connect the new increment stmt
4913 to the existing def-use update-chain of the pointer, by modifying
4914 the PTR_INCR as illustrated below:
4916 The pointer def-use update-chain before this function:
4917 DATAREF_PTR = phi (p_0, p_2)
4918 ....
4919 PTR_INCR: p_2 = DATAREF_PTR + step
4921 The pointer def-use update-chain after this function:
4922 DATAREF_PTR = phi (p_0, p_2)
4923 ....
4924 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4925 ....
4926 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4928 Input:
4929 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4930 in the loop.
4931 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4932 the loop. The increment amount across iterations is expected
4933 to be vector_size.
4934 BSI - location where the new update stmt is to be placed.
4935 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4936 BUMP - optional. The offset by which to bump the pointer. If not given,
4937 the offset is assumed to be vector_size.
4939 Output: Return NEW_DATAREF_PTR as illustrated above.
4943 tree
4944 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4945 stmt_vec_info stmt_info, tree bump)
4947 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4948 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4949 tree update = TYPE_SIZE_UNIT (vectype);
4950 gassign *incr_stmt;
4951 ssa_op_iter iter;
4952 use_operand_p use_p;
4953 tree new_dataref_ptr;
4955 if (bump)
4956 update = bump;
4958 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4959 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4960 else
4961 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4962 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4963 dataref_ptr, update);
4964 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4966 /* Copy the points-to information if it exists. */
4967 if (DR_PTR_INFO (dr))
4969 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4970 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4973 if (!ptr_incr)
4974 return new_dataref_ptr;
4976 /* Update the vector-pointer's cross-iteration increment. */
4977 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4979 tree use = USE_FROM_PTR (use_p);
4981 if (use == dataref_ptr)
4982 SET_USE (use_p, new_dataref_ptr);
4983 else
4984 gcc_assert (operand_equal_p (use, update, 0));
4987 return new_dataref_ptr;
4991 /* Copy memory reference info such as base/clique from the SRC reference
4992 to the DEST MEM_REF. */
4994 void
4995 vect_copy_ref_info (tree dest, tree src)
4997 if (TREE_CODE (dest) != MEM_REF)
4998 return;
5000 tree src_base = src;
5001 while (handled_component_p (src_base))
5002 src_base = TREE_OPERAND (src_base, 0);
5003 if (TREE_CODE (src_base) != MEM_REF
5004 && TREE_CODE (src_base) != TARGET_MEM_REF)
5005 return;
5007 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5008 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5012 /* Function vect_create_destination_var.
5014 Create a new temporary of type VECTYPE. */
5016 tree
5017 vect_create_destination_var (tree scalar_dest, tree vectype)
5019 tree vec_dest;
5020 const char *name;
5021 char *new_name;
5022 tree type;
5023 enum vect_var_kind kind;
5025 kind = vectype
5026 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5027 ? vect_mask_var
5028 : vect_simple_var
5029 : vect_scalar_var;
5030 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5032 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5034 name = get_name (scalar_dest);
5035 if (name)
5036 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5037 else
5038 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5039 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5040 free (new_name);
5042 return vec_dest;
5045 /* Function vect_grouped_store_supported.
5047 Returns TRUE if interleave high and interleave low permutations
5048 are supported, and FALSE otherwise. */
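/* As a concrete illustration (target-independent): interleaving two
   4-element vectors {0 1 2 3} and {4 5 6 7} uses the constant
   permutations

     interleave_high: 0 4 1 5
     interleave_low:  2 6 3 7

   which is what the power-of-two branch below asks the target about;
   the COUNT == 3 branch instead builds the two three-way VEC_PERM_EXPR
   masks used by vect_permute_store_chain.  */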
5050 bool
5051 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5053 machine_mode mode = TYPE_MODE (vectype);
5055 /* vect_permute_store_chain requires the group size to be equal to 3 or
5056 be a power of two. */
5057 if (count != 3 && exact_log2 (count) == -1)
5059 if (dump_enabled_p ())
5060 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5061 "the size of the group of accesses"
5062 " is not a power of 2 or not eqaul to 3\n");
5063 return false;
5066 /* Check that the permutation is supported. */
5067 if (VECTOR_MODE_P (mode))
5069 unsigned int i;
5070 if (count == 3)
5072 unsigned int j0 = 0, j1 = 0, j2 = 0;
5073 unsigned int i, j;
5075 unsigned int nelt;
5076 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5078 if (dump_enabled_p ())
5079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5080 "cannot handle groups of 3 stores for"
5081 " variable-length vectors\n");
5082 return false;
5085 vec_perm_builder sel (nelt, nelt, 1);
5086 sel.quick_grow (nelt);
5087 vec_perm_indices indices;
5088 for (j = 0; j < 3; j++)
5090 int nelt0 = ((3 - j) * nelt) % 3;
5091 int nelt1 = ((3 - j) * nelt + 1) % 3;
5092 int nelt2 = ((3 - j) * nelt + 2) % 3;
5093 for (i = 0; i < nelt; i++)
5095 if (3 * i + nelt0 < nelt)
5096 sel[3 * i + nelt0] = j0++;
5097 if (3 * i + nelt1 < nelt)
5098 sel[3 * i + nelt1] = nelt + j1++;
5099 if (3 * i + nelt2 < nelt)
5100 sel[3 * i + nelt2] = 0;
5102 indices.new_vector (sel, 2, nelt);
5103 if (!can_vec_perm_const_p (mode, indices))
5105 if (dump_enabled_p ())
5106 dump_printf (MSG_MISSED_OPTIMIZATION,
5107 "permutation op not supported by target.\n");
5108 return false;
5111 for (i = 0; i < nelt; i++)
5113 if (3 * i + nelt0 < nelt)
5114 sel[3 * i + nelt0] = 3 * i + nelt0;
5115 if (3 * i + nelt1 < nelt)
5116 sel[3 * i + nelt1] = 3 * i + nelt1;
5117 if (3 * i + nelt2 < nelt)
5118 sel[3 * i + nelt2] = nelt + j2++;
5120 indices.new_vector (sel, 2, nelt);
5121 if (!can_vec_perm_const_p (mode, indices))
5123 if (dump_enabled_p ())
5124 dump_printf (MSG_MISSED_OPTIMIZATION,
5125 "permutation op not supported by target.\n");
5126 return false;
5129 return true;
5131 else
5133 /* If COUNT is not equal to 3 then only a power of 2 is supported. */
5134 gcc_assert (pow2p_hwi (count));
5135 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5137 /* The encoding has 2 interleaved stepped patterns. */
5138 vec_perm_builder sel (nelt, 2, 3);
5139 sel.quick_grow (6);
5140 for (i = 0; i < 3; i++)
5142 sel[i * 2] = i;
5143 sel[i * 2 + 1] = i + nelt;
5145 vec_perm_indices indices (sel, 2, nelt);
5146 if (can_vec_perm_const_p (mode, indices))
5148 for (i = 0; i < 6; i++)
5149 sel[i] += exact_div (nelt, 2);
5150 indices.new_vector (sel, 2, nelt);
5151 if (can_vec_perm_const_p (mode, indices))
5152 return true;
5157 if (dump_enabled_p ())
5158 dump_printf (MSG_MISSED_OPTIMIZATION,
5159 "permutaion op not supported by target.\n");
5160 return false;
5164 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5165 type VECTYPE. MASKED_P says whether the masked form is needed. */
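/* For example (sketch): vec_store_lanes with COUNT == 2 stores the
   elements of two vectors {a0 a1 ...} and {b0 b1 ...} to memory in the
   interleaved order a0 b0 a1 b1 ..., so a grouped store of two
   interleaved sequences needs no separate permutation statements.  */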
5167 bool
5168 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5169 bool masked_p)
5171 if (masked_p)
5172 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5173 vec_mask_store_lanes_optab,
5174 vectype, count);
5175 else
5176 return vect_lanes_optab_supported_p ("vec_store_lanes",
5177 vec_store_lanes_optab,
5178 vectype, count);
5182 /* Function vect_permute_store_chain.
5184 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5185 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5186 the data correctly for the stores. Return the final references for stores
5187 in RESULT_CHAIN.
5189 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5190 The input is 4 vectors each containing 8 elements. We assign a number to
5191 each element, the input sequence is:
5193 1st vec: 0 1 2 3 4 5 6 7
5194 2nd vec: 8 9 10 11 12 13 14 15
5195 3rd vec: 16 17 18 19 20 21 22 23
5196 4th vec: 24 25 26 27 28 29 30 31
5198 The output sequence should be:
5200 1st vec: 0 8 16 24 1 9 17 25
5201 2nd vec: 2 10 18 26 3 11 19 27
5202 3rd vec: 4 12 20 28 5 13 21 29
5203 4th vec: 6 14 22 30 7 15 23 31
5205 i.e., we interleave the contents of the four vectors in their order.
5207 We use interleave_high/low instructions to create such output. The input of
5208 each interleave_high/low operation is two vectors:
5209 1st vec 2nd vec
5210 0 1 2 3 4 5 6 7
5211 the even elements of the result vector are obtained left-to-right from the
5212 high/low elements of the first vector. The odd elements of the result are
5213 obtained left-to-right from the high/low elements of the second vector.
5214 The output of interleave_high will be: 0 4 1 5
5215 and of interleave_low: 2 6 3 7
5218 The permutation is done in log LENGTH stages. In each stage interleave_high
5219 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5220 where the first argument is taken from the first half of DR_CHAIN and the
5221 second argument from its second half.
5222 In our example,
5224 I1: interleave_high (1st vec, 3rd vec)
5225 I2: interleave_low (1st vec, 3rd vec)
5226 I3: interleave_high (2nd vec, 4th vec)
5227 I4: interleave_low (2nd vec, 4th vec)
5229 The output for the first stage is:
5231 I1: 0 16 1 17 2 18 3 19
5232 I2: 4 20 5 21 6 22 7 23
5233 I3: 8 24 9 25 10 26 11 27
5234 I4: 12 28 13 29 14 30 15 31
5236 The output of the second stage, i.e. the final result is:
5238 I1: 0 8 16 24 1 9 17 25
5239 I2: 2 10 18 26 3 11 19 27
5240 I3: 4 12 20 28 5 13 21 29
5241 I4: 6 14 22 30 7 15 23 31. */
5243 void
5244 vect_permute_store_chain (vec<tree> dr_chain,
5245 unsigned int length,
5246 stmt_vec_info stmt_info,
5247 gimple_stmt_iterator *gsi,
5248 vec<tree> *result_chain)
5250 tree vect1, vect2, high, low;
5251 gimple *perm_stmt;
5252 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5253 tree perm_mask_low, perm_mask_high;
5254 tree data_ref;
5255 tree perm3_mask_low, perm3_mask_high;
5256 unsigned int i, j, n, log_length = exact_log2 (length);
5258 result_chain->quick_grow (length);
5259 memcpy (result_chain->address (), dr_chain.address (),
5260 length * sizeof (tree));
5262 if (length == 3)
5264 /* vect_grouped_store_supported ensures that this is constant. */
5265 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5266 unsigned int j0 = 0, j1 = 0, j2 = 0;
5268 vec_perm_builder sel (nelt, nelt, 1);
5269 sel.quick_grow (nelt);
5270 vec_perm_indices indices;
5271 for (j = 0; j < 3; j++)
5273 int nelt0 = ((3 - j) * nelt) % 3;
5274 int nelt1 = ((3 - j) * nelt + 1) % 3;
5275 int nelt2 = ((3 - j) * nelt + 2) % 3;
5277 for (i = 0; i < nelt; i++)
5279 if (3 * i + nelt0 < nelt)
5280 sel[3 * i + nelt0] = j0++;
5281 if (3 * i + nelt1 < nelt)
5282 sel[3 * i + nelt1] = nelt + j1++;
5283 if (3 * i + nelt2 < nelt)
5284 sel[3 * i + nelt2] = 0;
5286 indices.new_vector (sel, 2, nelt);
5287 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5289 for (i = 0; i < nelt; i++)
5291 if (3 * i + nelt0 < nelt)
5292 sel[3 * i + nelt0] = 3 * i + nelt0;
5293 if (3 * i + nelt1 < nelt)
5294 sel[3 * i + nelt1] = 3 * i + nelt1;
5295 if (3 * i + nelt2 < nelt)
5296 sel[3 * i + nelt2] = nelt + j2++;
5298 indices.new_vector (sel, 2, nelt);
5299 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5301 vect1 = dr_chain[0];
5302 vect2 = dr_chain[1];
5304 /* Create interleaving stmt:
5305 low = VEC_PERM_EXPR <vect1, vect2,
5306 {j, nelt + j, *, j + 1, nelt + j + 1, *,
5307 j + 2, nelt + j + 2, *, ...}> */
5308 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5309 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5310 vect2, perm3_mask_low);
5311 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5313 vect1 = data_ref;
5314 vect2 = dr_chain[2];
5315 /* Create interleaving stmt:
5316 high = VEC_PERM_EXPR <vect1, vect2,
5317 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5318 6, 7, nelt + j + 2, ...}> */
5319 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5320 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5321 vect2, perm3_mask_high);
5322 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5323 (*result_chain)[j] = data_ref;
5326 else
5328 /* If length is not equal to 3, then only a power of 2 is supported. */
5329 gcc_assert (pow2p_hwi (length));
5331 /* The encoding has 2 interleaved stepped patterns. */
5332 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5333 vec_perm_builder sel (nelt, 2, 3);
5334 sel.quick_grow (6);
5335 for (i = 0; i < 3; i++)
5337 sel[i * 2] = i;
5338 sel[i * 2 + 1] = i + nelt;
5340 vec_perm_indices indices (sel, 2, nelt);
5341 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5343 for (i = 0; i < 6; i++)
5344 sel[i] += exact_div (nelt, 2);
5345 indices.new_vector (sel, 2, nelt);
5346 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5348 for (i = 0, n = log_length; i < n; i++)
5350 for (j = 0; j < length/2; j++)
5352 vect1 = dr_chain[j];
5353 vect2 = dr_chain[j+length/2];
5355 /* Create interleaving stmt:
5356 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5357 ...}> */
5358 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5359 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5360 vect2, perm_mask_high);
5361 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5362 (*result_chain)[2*j] = high;
5364 /* Create interleaving stmt:
5365 low = VEC_PERM_EXPR <vect1, vect2,
5366 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5367 ...}> */
5368 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5369 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5370 vect2, perm_mask_low);
5371 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5372 (*result_chain)[2*j+1] = low;
5374 memcpy (dr_chain.address (), result_chain->address (),
5375 length * sizeof (tree));
5380 /* Function vect_setup_realignment
5382 This function is called when vectorizing an unaligned load using
5383 the dr_explicit_realign[_optimized] scheme.
5384 This function generates the following code at the loop prolog:
5386 p = initial_addr;
5387 x msq_init = *(floor(p)); # prolog load
5388 realignment_token = call target_builtin;
5389 loop:
5390 x msq = phi (msq_init, ---)
5392 The stmts marked with x are generated only for the case of
5393 dr_explicit_realign_optimized.
5395 The code above sets up a new (vector) pointer, pointing to the first
5396 location accessed by STMT_INFO, and a "floor-aligned" load using that
5397 pointer. It also generates code to compute the "realignment-token"
5398 (if the relevant target hook was defined), and creates a phi-node at the
5399 loop-header bb whose arguments are the result of the prolog-load (created
5400 by this function) and the result of a load that takes place in the loop
5401 (to be created by the caller to this function).
5403 For the case of dr_explicit_realign_optimized:
5404 The caller to this function uses the phi-result (msq) to create the
5405 realignment code inside the loop, and sets up the missing phi argument,
5406 as follows:
5407 loop:
5408 msq = phi (msq_init, lsq)
5409 lsq = *(floor(p')); # load in loop
5410 result = realign_load (msq, lsq, realignment_token);
5412 For the case of dr_explicit_realign:
5413 loop:
5414 msq = *(floor(p)); # load in loop
5415 p' = p + (VS-1);
5416 lsq = *(floor(p')); # load in loop
5417 result = realign_load (msq, lsq, realignment_token);
5419 Input:
5420 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5421 a memory location that may be unaligned.
5422 GSI - place where new code is to be inserted.
5423 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5424 is used.
5426 Output:
5427 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5428 target hook, if defined.
5429 Return value - the result of the loop-header phi node. */
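/* As an illustrative sketch (assuming a 16-byte target alignment), the
   prolog generated below for dr_explicit_realign_optimized amounts to

     vectp = <initial address>;                # vect_create_data_ref_ptr
     vectp_floor = vectp & -16;                # the BIT_AND_EXPR below
     msq_init = MEM[vectp_floor];              # prolog load
     realignment_token = <mask_for_load builtin> (init_addr);

   where the last call is only emitted if the target defines the
   builtin_mask_for_load hook.  The names above are placeholders; the
   actual SSA names come from make_ssa_name/copy_ssa_name.  */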
5431 tree
5432 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5433 tree *realignment_token,
5434 enum dr_alignment_support alignment_support_scheme,
5435 tree init_addr,
5436 struct loop **at_loop)
5438 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5439 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5440 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5441 struct data_reference *dr = dr_info->dr;
5442 struct loop *loop = NULL;
5443 edge pe = NULL;
5444 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5445 tree vec_dest;
5446 gimple *inc;
5447 tree ptr;
5448 tree data_ref;
5449 basic_block new_bb;
5450 tree msq_init = NULL_TREE;
5451 tree new_temp;
5452 gphi *phi_stmt;
5453 tree msq = NULL_TREE;
5454 gimple_seq stmts = NULL;
5455 bool compute_in_loop = false;
5456 bool nested_in_vect_loop = false;
5457 struct loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5458 struct loop *loop_for_initial_load = NULL;
5460 if (loop_vinfo)
5462 loop = LOOP_VINFO_LOOP (loop_vinfo);
5463 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5466 gcc_assert (alignment_support_scheme == dr_explicit_realign
5467 || alignment_support_scheme == dr_explicit_realign_optimized);
5469 /* We need to generate three things:
5470 1. the misalignment computation
5471 2. the extra vector load (for the optimized realignment scheme).
5472 3. the phi node for the two vectors from which the realignment is
5473 done (for the optimized realignment scheme). */
5475 /* 1. Determine where to generate the misalignment computation.
5477 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5478 calculation will be generated by this function, outside the loop (in the
5479 preheader). Otherwise, INIT_ADDR has already been computed for us by the
5480 caller, inside the loop.
5482 Background: If the misalignment remains fixed throughout the iterations of
5483 the loop, then both realignment schemes are applicable, and also the
5484 misalignment computation can be done outside LOOP. This is because we are
5485 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5486 are a multiple of VS (the Vector Size), and therefore the misalignment in
5487 different vectorized LOOP iterations is always the same.
5488 The problem arises only if the memory access is in an inner-loop nested
5489 inside LOOP, which is now being vectorized using outer-loop vectorization.
5490 This is the only case when the misalignment of the memory access may not
5491 remain fixed throughout the iterations of the inner-loop (as explained in
5492 detail in vect_supportable_dr_alignment). In this case, not only is the
5493 optimized realignment scheme not applicable, but also the misalignment
5494 computation (and generation of the realignment token that is passed to
5495 REALIGN_LOAD) has to be done inside the loop.
5497 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5498 or not, which in turn determines if the misalignment is computed inside
5499 the inner-loop, or outside LOOP. */
5501 if (init_addr != NULL_TREE || !loop_vinfo)
5503 compute_in_loop = true;
5504 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5508 /* 2. Determine where to generate the extra vector load.
5510 For the optimized realignment scheme, instead of generating two vector
5511 loads in each iteration, we generate a single extra vector load in the
5512 preheader of the loop, and in each iteration reuse the result of the
5513 vector load from the previous iteration. In case the memory access is in
5514 an inner-loop nested inside LOOP, which is now being vectorized using
5515 outer-loop vectorization, we need to determine whether this initial vector
5516 load should be generated at the preheader of the inner-loop, or can be
5517 generated at the preheader of LOOP. If the memory access has no evolution
5518 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5519 to be generated inside LOOP (in the preheader of the inner-loop). */
5521 if (nested_in_vect_loop)
5523 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5524 bool invariant_in_outerloop =
5525 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5526 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5528 else
5529 loop_for_initial_load = loop;
5530 if (at_loop)
5531 *at_loop = loop_for_initial_load;
5533 if (loop_for_initial_load)
5534 pe = loop_preheader_edge (loop_for_initial_load);
5536 /* 3. For the case of the optimized realignment, create the first vector
5537 load at the loop preheader. */
5539 if (alignment_support_scheme == dr_explicit_realign_optimized)
5541 /* Create msq_init = *(floor(p1)) in the loop preheader */
5542 gassign *new_stmt;
5544 gcc_assert (!compute_in_loop);
5545 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5546 ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5547 loop_for_initial_load, NULL_TREE,
5548 &init_addr, NULL, &inc, true);
5549 if (TREE_CODE (ptr) == SSA_NAME)
5550 new_temp = copy_ssa_name (ptr);
5551 else
5552 new_temp = make_ssa_name (TREE_TYPE (ptr));
5553 unsigned int align = DR_TARGET_ALIGNMENT (dr_info);
5554 new_stmt = gimple_build_assign
5555 (new_temp, BIT_AND_EXPR, ptr,
5556 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5557 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5558 gcc_assert (!new_bb);
5559 data_ref
5560 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5561 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5562 vect_copy_ref_info (data_ref, DR_REF (dr));
5563 new_stmt = gimple_build_assign (vec_dest, data_ref);
5564 new_temp = make_ssa_name (vec_dest, new_stmt);
5565 gimple_assign_set_lhs (new_stmt, new_temp);
5566 if (pe)
5568 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5569 gcc_assert (!new_bb);
5571 else
5572 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5574 msq_init = gimple_assign_lhs (new_stmt);
5577 /* 4. Create realignment token using a target builtin, if available.
5578 It is done either inside the containing loop, or before LOOP (as
5579 determined above). */
5581 if (targetm.vectorize.builtin_mask_for_load)
5583 gcall *new_stmt;
5584 tree builtin_decl;
5586 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5587 if (!init_addr)
5589 /* Generate the INIT_ADDR computation outside LOOP. */
5590 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5591 NULL_TREE);
5592 if (loop)
5594 pe = loop_preheader_edge (loop);
5595 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5596 gcc_assert (!new_bb);
5598 else
5599 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5602 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5603 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5604 vec_dest =
5605 vect_create_destination_var (scalar_dest,
5606 gimple_call_return_type (new_stmt));
5607 new_temp = make_ssa_name (vec_dest, new_stmt);
5608 gimple_call_set_lhs (new_stmt, new_temp);
5610 if (compute_in_loop)
5611 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5612 else
5614 /* Generate the misalignment computation outside LOOP. */
5615 pe = loop_preheader_edge (loop);
5616 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5617 gcc_assert (!new_bb);
5620 *realignment_token = gimple_call_lhs (new_stmt);
5622 /* The result of the CALL_EXPR to this builtin is determined from
5623 the value of the parameter and no global variables are touched
5624 which makes the builtin a "const" function. Requiring the
5625 builtin to have the "const" attribute makes it unnecessary
5626 to call mark_call_clobbered. */
5627 gcc_assert (TREE_READONLY (builtin_decl));
5630 if (alignment_support_scheme == dr_explicit_realign)
5631 return msq;
5633 gcc_assert (!compute_in_loop);
5634 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5637 /* 5. Create msq = phi <msq_init, lsq> in loop */
5639 pe = loop_preheader_edge (containing_loop);
5640 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5641 msq = make_ssa_name (vec_dest);
5642 phi_stmt = create_phi_node (msq, containing_loop->header);
5643 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5645 return msq;
5649 /* Function vect_grouped_load_supported.
5651 COUNT is the size of the load group (the number of statements plus the
5652 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5653 only one statement, with a gap of COUNT - 1.
5655 Returns true if a suitable permute exists. */
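/* As a worked example, for COUNT == 3 with 4-element vectors (nelt == 4)
   the pair of permutations checked below for k == 0 is

     { 0, 3, 6, 0 }   every 3rd element of the first two vectors
     { 0, 1, 2, 5 }   fill the last slot from the third vector

   which together extract field 0 of a 3-field structure.  The power-of-2
   case instead checks the extract-even { 0, 2, 4, ... } and extract-odd
   { 1, 3, 5, ... } masks.  */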
5657 bool
5658 vect_grouped_load_supported (tree vectype, bool single_element_p,
5659 unsigned HOST_WIDE_INT count)
5661 machine_mode mode = TYPE_MODE (vectype);
5663 /* If this is single-element interleaving with an element distance
5664 that leaves unused vector loads around, punt - we at least create
5665 very sub-optimal code in that case (and blow up memory,
5666 see PR65518). */
5667 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5669 if (dump_enabled_p ())
5670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5671 "single-element interleaving not supported "
5672 "for not adjacent vector loads\n");
5673 return false;
5676 /* vect_permute_load_chain requires the group size to be equal to 3 or
5677 be a power of two. */
5678 if (count != 3 && exact_log2 (count) == -1)
5680 if (dump_enabled_p ())
5681 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5682 "the size of the group of accesses"
5683 " is not a power of 2 or not equal to 3\n");
5684 return false;
5687 /* Check that the permutation is supported. */
5688 if (VECTOR_MODE_P (mode))
5690 unsigned int i, j;
5691 if (count == 3)
5693 unsigned int nelt;
5694 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5696 if (dump_enabled_p ())
5697 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5698 "cannot handle groups of 3 loads for"
5699 " variable-length vectors\n");
5700 return false;
5703 vec_perm_builder sel (nelt, nelt, 1);
5704 sel.quick_grow (nelt);
5705 vec_perm_indices indices;
5706 unsigned int k;
5707 for (k = 0; k < 3; k++)
5709 for (i = 0; i < nelt; i++)
5710 if (3 * i + k < 2 * nelt)
5711 sel[i] = 3 * i + k;
5712 else
5713 sel[i] = 0;
5714 indices.new_vector (sel, 2, nelt);
5715 if (!can_vec_perm_const_p (mode, indices))
5717 if (dump_enabled_p ())
5718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5719 "shuffle of 3 loads is not supported by"
5720 " target\n");
5721 return false;
5723 for (i = 0, j = 0; i < nelt; i++)
5724 if (3 * i + k < 2 * nelt)
5725 sel[i] = i;
5726 else
5727 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5728 indices.new_vector (sel, 2, nelt);
5729 if (!can_vec_perm_const_p (mode, indices))
5731 if (dump_enabled_p ())
5732 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5733 "shuffle of 3 loads is not supported by"
5734 " target\n");
5735 return false;
5738 return true;
5740 else
5742 /* If length is not equal to 3, then only a power of 2 is supported. */
5743 gcc_assert (pow2p_hwi (count));
5744 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5746 /* The encoding has a single stepped pattern. */
5747 vec_perm_builder sel (nelt, 1, 3);
5748 sel.quick_grow (3);
5749 for (i = 0; i < 3; i++)
5750 sel[i] = i * 2;
5751 vec_perm_indices indices (sel, 2, nelt);
5752 if (can_vec_perm_const_p (mode, indices))
5754 for (i = 0; i < 3; i++)
5755 sel[i] = i * 2 + 1;
5756 indices.new_vector (sel, 2, nelt);
5757 if (can_vec_perm_const_p (mode, indices))
5758 return true;
5763 if (dump_enabled_p ())
5764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5765 "extract even/odd not supported by target\n");
5766 return false;
5769 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5770 type VECTYPE. MASKED_P says whether the masked form is needed. */
5772 bool
5773 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5774 bool masked_p)
5776 if (masked_p)
5777 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5778 vec_mask_load_lanes_optab,
5779 vectype, count);
5780 else
5781 return vect_lanes_optab_supported_p ("vec_load_lanes",
5782 vec_load_lanes_optab,
5783 vectype, count);
5786 /* Function vect_permute_load_chain.
5788 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5789 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5790 the input data correctly. Return the final references for loads in
5791 RESULT_CHAIN.
5793 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5794 The input is 4 vectors, each containing 8 elements. We assign a number to each
5795 element; the input sequence is:
5797 1st vec: 0 1 2 3 4 5 6 7
5798 2nd vec: 8 9 10 11 12 13 14 15
5799 3rd vec: 16 17 18 19 20 21 22 23
5800 4th vec: 24 25 26 27 28 29 30 31
5802 The output sequence should be:
5804 1st vec: 0 4 8 12 16 20 24 28
5805 2nd vec: 1 5 9 13 17 21 25 29
5806 3rd vec: 2 6 10 14 18 22 26 30
5807 4th vec: 3 7 11 15 19 23 27 31
5809 i.e., the first output vector should contain the first elements of each
5810 interleaving group, etc.
5812 We use extract_even/odd instructions to create such output. The input of
5813 each extract_even/odd operation is two vectors
5814 1st vec 2nd vec
5815 0 1 2 3 4 5 6 7
5817 and the output is the vector of extracted even/odd elements. The output of
5818 extract_even will be: 0 2 4 6
5819 and of extract_odd: 1 3 5 7
5822 The permutation is done in log LENGTH stages. In each stage extract_even
5823 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5824 their order. In our example,
5826 E1: extract_even (1st vec, 2nd vec)
5827 E2: extract_odd (1st vec, 2nd vec)
5828 E3: extract_even (3rd vec, 4th vec)
5829 E4: extract_odd (3rd vec, 4th vec)
5831 The output for the first stage will be:
5833 E1: 0 2 4 6 8 10 12 14
5834 E2: 1 3 5 7 9 11 13 15
5835 E3: 16 18 20 22 24 26 28 30
5836 E4: 17 19 21 23 25 27 29 31
5838 In order to proceed and create the correct sequence for the next stage (or
5839 for the correct output, if the second stage is the last one, as in our
5840 example), we first put the output of extract_even operation and then the
5841 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5842 The input for the second stage is:
5844 1st vec (E1): 0 2 4 6 8 10 12 14
5845 2nd vec (E3): 16 18 20 22 24 26 28 30
5846 3rd vec (E2): 1 3 5 7 9 11 13 15
5847 4th vec (E4): 17 19 21 23 25 27 29 31
5849 The output of the second stage:
5851 E1: 0 4 8 12 16 20 24 28
5852 E2: 2 6 10 14 18 22 26 30
5853 E3: 1 5 9 13 17 21 25 29
5854 E4: 3 7 11 15 19 23 27 31
5856 And RESULT_CHAIN after reordering:
5858 1st vec (E1): 0 4 8 12 16 20 24 28
5859 2nd vec (E3): 1 5 9 13 17 21 25 29
5860 3rd vec (E2): 2 6 10 14 18 22 26 30
5861 4th vec (E4): 3 7 11 15 19 23 27 31. */
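/* As a concrete illustration, assuming 8-element vectors (nelt == 8), the
   masks built below for the power-of-2 case are

     perm_mask_even = { 0, 2, 4, 6, 8, 10, 12, 14 }
     perm_mask_odd  = { 1, 3, 5, 7, 9, 11, 13, 15 }

   so E1 in the example above is VEC_PERM_EXPR <1st vec, 2nd vec,
   perm_mask_even> and E2 is the same with perm_mask_odd.  */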
5863 static void
5864 vect_permute_load_chain (vec<tree> dr_chain,
5865 unsigned int length,
5866 stmt_vec_info stmt_info,
5867 gimple_stmt_iterator *gsi,
5868 vec<tree> *result_chain)
5870 tree data_ref, first_vect, second_vect;
5871 tree perm_mask_even, perm_mask_odd;
5872 tree perm3_mask_low, perm3_mask_high;
5873 gimple *perm_stmt;
5874 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5875 unsigned int i, j, log_length = exact_log2 (length);
5877 result_chain->quick_grow (length);
5878 memcpy (result_chain->address (), dr_chain.address (),
5879 length * sizeof (tree));
5881 if (length == 3)
5883 /* vect_grouped_load_supported ensures that this is constant. */
5884 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5885 unsigned int k;
5887 vec_perm_builder sel (nelt, nelt, 1);
5888 sel.quick_grow (nelt);
5889 vec_perm_indices indices;
5890 for (k = 0; k < 3; k++)
5892 for (i = 0; i < nelt; i++)
5893 if (3 * i + k < 2 * nelt)
5894 sel[i] = 3 * i + k;
5895 else
5896 sel[i] = 0;
5897 indices.new_vector (sel, 2, nelt);
5898 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5900 for (i = 0, j = 0; i < nelt; i++)
5901 if (3 * i + k < 2 * nelt)
5902 sel[i] = i;
5903 else
5904 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5905 indices.new_vector (sel, 2, nelt);
5906 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5908 first_vect = dr_chain[0];
5909 second_vect = dr_chain[1];
5911 /* Create interleaving stmt (low part of):
5912 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5913 ...}> */
5914 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5915 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5916 second_vect, perm3_mask_low);
5917 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5919 /* Create interleaving stmt (high part of):
5920 high = VEC_PERM_EXPR <first_vect, second_vect, {0, 1, 2, ...,
5921 nelt + ((nelt + k) % 3), nelt + ((nelt + k) % 3) + 3, ...}> */
5922 first_vect = data_ref;
5923 second_vect = dr_chain[2];
5924 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5925 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5926 second_vect, perm3_mask_high);
5927 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5928 (*result_chain)[k] = data_ref;
5931 else
5933 /* If length is not equal to 3, then only a power of 2 is supported. */
5934 gcc_assert (pow2p_hwi (length));
5936 /* The encoding has a single stepped pattern. */
5937 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5938 vec_perm_builder sel (nelt, 1, 3);
5939 sel.quick_grow (3);
5940 for (i = 0; i < 3; ++i)
5941 sel[i] = i * 2;
5942 vec_perm_indices indices (sel, 2, nelt);
5943 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5945 for (i = 0; i < 3; ++i)
5946 sel[i] = i * 2 + 1;
5947 indices.new_vector (sel, 2, nelt);
5948 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5950 for (i = 0; i < log_length; i++)
5952 for (j = 0; j < length; j += 2)
5954 first_vect = dr_chain[j];
5955 second_vect = dr_chain[j+1];
5957 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5958 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5959 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5960 first_vect, second_vect,
5961 perm_mask_even);
5962 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5963 (*result_chain)[j/2] = data_ref;
5965 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5966 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5967 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5968 first_vect, second_vect,
5969 perm_mask_odd);
5970 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5971 (*result_chain)[j/2+length/2] = data_ref;
5973 memcpy (dr_chain.address (), result_chain->address (),
5974 length * sizeof (tree));
5979 /* Function vect_shift_permute_load_chain.
5981 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
5982 sequence of stmts to reorder the input data accordingly.
5983 Return the final references for loads in RESULT_CHAIN.
5984 Return true if successful, false otherwise.
5986 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5987 The input is 3 vectors, each containing 8 elements. We assign a
5988 number to each element; the input sequence is:
5990 1st vec: 0 1 2 3 4 5 6 7
5991 2nd vec: 8 9 10 11 12 13 14 15
5992 3rd vec: 16 17 18 19 20 21 22 23
5994 The output sequence should be:
5996 1st vec: 0 3 6 9 12 15 18 21
5997 2nd vec: 1 4 7 10 13 16 19 22
5998 3rd vec: 2 5 8 11 14 17 20 23
6000 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6002 First we shuffle all 3 vectors to get correct elements order:
6004 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6005 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6006 3rd vec: (16 19 22) (17 20 23) (18 21)
6008 Next we unite and shift vector 3 times:
6010 1st step:
6011 shift right by 6 the concatenation of:
6012 "1st vec" and "2nd vec"
6013 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6014 "2nd vec" and "3rd vec"
6015 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6016 "3rd vec" and "1st vec"
6017 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6018 | New vectors |
6020 So that now new vectors are:
6022 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6023 2nd vec: (10 13) (16 19 22) (17 20 23)
6024 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6026 2nd step:
6027 shift right by 5 the concatenation of:
6028 "1st vec" and "3rd vec"
6029 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6030 "2nd vec" and "1st vec"
6031 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6032 "3rd vec" and "2nd vec"
6033 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6034 | New vectors |
6036 So that now new vectors are:
6038 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6039 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6040 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6042 3rd step:
6043 shift right by 5 the concatenation of:
6044 "1st vec" and "1st vec"
6045 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6046 shift right by 3 the concatenation of:
6047 "2nd vec" and "2nd vec"
6048 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6049 | New vectors |
6051 So that now all vectors are READY:
6052 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6053 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6054 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6056 This algorithm is faster than the one in vect_permute_load_chain if:
6057 1. "shift of a concatenation" is faster than a general permutation.
6058 This is usually so.
6059 2. The TARGET machine can't execute vector instructions in parallel.
6060 This is because each step of the algorithm depends on the previous one.
6061 The algorithm in vect_permute_load_chain is much more parallel.
6063 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
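/* As a worked example of the power-of-2 path, for LENGTH == 2 with
   8-element vectors (nelt == 8) the masks built below are

     perm2_mask1 = { 0, 2, 4, 6, 1, 3, 5, 7 }   evens then odds of a vector
     perm2_mask2 = { 1, 3, 5, 7, 0, 2, 4, 6 }   odds then evens of a vector
     shift1_mask = { 4, 5, 6, 7, 8, 9, 10, 11 }
     select_mask = { 0, 1, 2, 3, 12, 13, 14, 15 }

   so for an interleaved input a0 b0 a1 b1 ... a7 b7 the SELECT_MASK
   permutation yields a0 ... a7 and the SHIFT1_MASK permutation yields
   b0 ... b7.  */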
6066 static bool
6067 vect_shift_permute_load_chain (vec<tree> dr_chain,
6068 unsigned int length,
6069 stmt_vec_info stmt_info,
6070 gimple_stmt_iterator *gsi,
6071 vec<tree> *result_chain)
6073 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6074 tree perm2_mask1, perm2_mask2, perm3_mask;
6075 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6076 gimple *perm_stmt;
6078 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6079 unsigned int i;
6080 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6082 unsigned HOST_WIDE_INT nelt, vf;
6083 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6084 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6085 /* Not supported for variable-length vectors. */
6086 return false;
6088 vec_perm_builder sel (nelt, nelt, 1);
6089 sel.quick_grow (nelt);
6091 result_chain->quick_grow (length);
6092 memcpy (result_chain->address (), dr_chain.address (),
6093 length * sizeof (tree));
6095 if (pow2p_hwi (length) && vf > 4)
6097 unsigned int j, log_length = exact_log2 (length);
6098 for (i = 0; i < nelt / 2; ++i)
6099 sel[i] = i * 2;
6100 for (i = 0; i < nelt / 2; ++i)
6101 sel[nelt / 2 + i] = i * 2 + 1;
6102 vec_perm_indices indices (sel, 2, nelt);
6103 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6105 if (dump_enabled_p ())
6106 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6107 "shuffle of 2 fields structure is not \
6108 supported by target\n");
6109 return false;
6111 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6113 for (i = 0; i < nelt / 2; ++i)
6114 sel[i] = i * 2 + 1;
6115 for (i = 0; i < nelt / 2; ++i)
6116 sel[nelt / 2 + i] = i * 2;
6117 indices.new_vector (sel, 2, nelt);
6118 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6120 if (dump_enabled_p ())
6121 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6122 "shuffle of 2 fields structure is not \
6123 supported by target\n");
6124 return false;
6126 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6128 /* Generating permutation constant to shift all elements.
6129 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6130 for (i = 0; i < nelt; i++)
6131 sel[i] = nelt / 2 + i;
6132 indices.new_vector (sel, 2, nelt);
6133 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6135 if (dump_enabled_p ())
6136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6137 "shift permutation is not supported by target\n");
6138 return false;
6140 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6142 /* Generating permutation constant to select vector from 2.
6143 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6144 for (i = 0; i < nelt / 2; i++)
6145 sel[i] = i;
6146 for (i = nelt / 2; i < nelt; i++)
6147 sel[i] = nelt + i;
6148 indices.new_vector (sel, 2, nelt);
6149 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6151 if (dump_enabled_p ())
6152 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6153 "select is not supported by target\n");
6154 return false;
6156 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6158 for (i = 0; i < log_length; i++)
6160 for (j = 0; j < length; j += 2)
6162 first_vect = dr_chain[j];
6163 second_vect = dr_chain[j + 1];
6165 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6166 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6167 first_vect, first_vect,
6168 perm2_mask1);
6169 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6170 vect[0] = data_ref;
6172 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6173 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6174 second_vect, second_vect,
6175 perm2_mask2);
6176 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6177 vect[1] = data_ref;
6179 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6180 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6181 vect[0], vect[1], shift1_mask);
6182 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6183 (*result_chain)[j/2 + length/2] = data_ref;
6185 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6186 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6187 vect[0], vect[1], select_mask);
6188 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6189 (*result_chain)[j/2] = data_ref;
6191 memcpy (dr_chain.address (), result_chain->address (),
6192 length * sizeof (tree));
6194 return true;
6196 if (length == 3 && vf > 2)
6198 unsigned int k = 0, l = 0;
6200 /* Generating permutation constant to get all elements in the right order.
6201 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6202 for (i = 0; i < nelt; i++)
6204 if (3 * k + (l % 3) >= nelt)
6206 k = 0;
6207 l += (3 - (nelt % 3));
6209 sel[i] = 3 * k + (l % 3);
6210 k++;
6212 vec_perm_indices indices (sel, 2, nelt);
6213 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6215 if (dump_enabled_p ())
6216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6217 "shuffle of 3 fields structure is not \
6218 supported by target\n");
6219 return false;
6221 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6223 /* Generating permutation constant to shift all elements.
6224 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6225 for (i = 0; i < nelt; i++)
6226 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6227 indices.new_vector (sel, 2, nelt);
6228 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6230 if (dump_enabled_p ())
6231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6232 "shift permutation is not supported by target\n");
6233 return false;
6235 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6237 /* Generating permutation constant to shift all elements.
6238 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6239 for (i = 0; i < nelt; i++)
6240 sel[i] = 2 * (nelt / 3) + 1 + i;
6241 indices.new_vector (sel, 2, nelt);
6242 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6244 if (dump_enabled_p ())
6245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6246 "shift permutation is not supported by target\n");
6247 return false;
6249 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6251 /* Generating permutation constant to shift all elements.
6252 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6253 for (i = 0; i < nelt; i++)
6254 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6255 indices.new_vector (sel, 2, nelt);
6256 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6258 if (dump_enabled_p ())
6259 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6260 "shift permutation is not supported by target\n");
6261 return false;
6263 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6265 /* Generating permutation constant to shift all elements.
6266 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6267 for (i = 0; i < nelt; i++)
6268 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6269 indices.new_vector (sel, 2, nelt);
6270 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6272 if (dump_enabled_p ())
6273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6274 "shift permutation is not supported by target\n");
6275 return false;
6277 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6279 for (k = 0; k < 3; k++)
6281 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6282 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6283 dr_chain[k], dr_chain[k],
6284 perm3_mask);
6285 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6286 vect[k] = data_ref;
6289 for (k = 0; k < 3; k++)
6291 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6292 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6293 vect[k % 3], vect[(k + 1) % 3],
6294 shift1_mask);
6295 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6296 vect_shift[k] = data_ref;
6299 for (k = 0; k < 3; k++)
6301 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6302 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6303 vect_shift[(4 - k) % 3],
6304 vect_shift[(3 - k) % 3],
6305 shift2_mask);
6306 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6307 vect[k] = data_ref;
6310 (*result_chain)[3 - (nelt % 3)] = vect[2];
6312 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6313 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6314 vect[0], shift3_mask);
6315 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6316 (*result_chain)[nelt % 3] = data_ref;
6318 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6319 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6320 vect[1], shift4_mask);
6321 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6322 (*result_chain)[0] = data_ref;
6323 return true;
6325 return false;
6328 /* Function vect_transform_grouped_load.
6330 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6331 to perform their permutation and ascribe the resulting vectorized statements to
6332 the scalar statements.
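/* In other words: when SIZE is a power of 2, or when the target's
   reassociation width for VEC_PERM_EXPR is greater than 1,
   vect_permute_load_chain is used directly; only when SIZE is not a power
   of 2 (i.e. SIZE == 3 for the supported groupings) on a target with
   reassociation width 1 is vect_shift_permute_load_chain tried first, with
   vect_permute_load_chain as the fallback if it cannot be used.  */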
6335 void
6336 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6337 int size, gimple_stmt_iterator *gsi)
6339 machine_mode mode;
6340 vec<tree> result_chain = vNULL;
6342 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6343 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6344 vectors, that are ready for vector computation. */
6345 result_chain.create (size);
6347 /* If the reassociation width for the vector type is 2 or greater, the target
6348 machine can execute 2 or more vector instructions in parallel. Otherwise
6349 try to get the chain for the load group using vect_shift_permute_load_chain. */
6350 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6351 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6352 || pow2p_hwi (size)
6353 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6354 gsi, &result_chain))
6355 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6356 vect_record_grouped_load_vectors (stmt_info, result_chain);
6357 result_chain.release ();
6360 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6361 generated as part of the vectorization of STMT_INFO. Assign the statement
6362 for each vector to the associated scalar statement. */
6364 void
6365 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6366 vec<tree> result_chain)
6368 vec_info *vinfo = stmt_info->vinfo;
6369 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6370 unsigned int i, gap_count;
6371 tree tmp_data_ref;
6373 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6374 Since we scan the chain starting from its first node, their order
6375 corresponds to the order of data-refs in RESULT_CHAIN. */
6376 stmt_vec_info next_stmt_info = first_stmt_info;
6377 gap_count = 1;
6378 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6380 if (!next_stmt_info)
6381 break;
6383 /* Skip the gaps. Loads created for the gaps will be removed by the dead
6384 code elimination pass later. No need to check for the first stmt in
6385 the group, since it always exists.
6386 DR_GROUP_GAP is the number of steps in elements from the previous
6387 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6388 correspond to the gaps. */
6389 if (next_stmt_info != first_stmt_info
6390 && gap_count < DR_GROUP_GAP (next_stmt_info))
6392 gap_count++;
6393 continue;
6396 while (next_stmt_info)
6398 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6399 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6400 copies, and we put the new vector statement in the first available
6401 RELATED_STMT. */
6402 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6403 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6404 else
6406 if (!DR_GROUP_SAME_DR_STMT (next_stmt_info))
6408 stmt_vec_info prev_stmt_info
6409 = STMT_VINFO_VEC_STMT (next_stmt_info);
6410 stmt_vec_info rel_stmt_info
6411 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6412 while (rel_stmt_info)
6414 prev_stmt_info = rel_stmt_info;
6415 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6418 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6422 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6423 gap_count = 1;
6424 /* If NEXT_STMT_INFO accesses the same DR as the previous statement,
6425 put the same TMP_DATA_REF as its vectorized statement; otherwise
6426 get the next data-ref from RESULT_CHAIN. */
6427 if (!next_stmt_info || !DR_GROUP_SAME_DR_STMT (next_stmt_info))
6428 break;
6433 /* Function vect_can_force_dr_alignment_p.
6435 Returns whether the alignment of a DECL can be forced to be aligned
6436 on an ALIGNMENT-bit boundary. */
6438 bool
6439 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6441 if (!VAR_P (decl))
6442 return false;
6444 if (decl_in_symtab_p (decl)
6445 && !symtab_node::get (decl)->can_increase_alignment_p ())
6446 return false;
6448 if (TREE_STATIC (decl))
6449 return (alignment <= MAX_OFILE_ALIGNMENT);
6450 else
6451 return (alignment <= MAX_STACK_ALIGNMENT);
6455 /* Return whether the data reference DR_INFO is supported with respect to its
6456 alignment.
6457 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6458 if it is aligned, i.e., check if it is possible to vectorize it with different
6459 alignment. */
6461 enum dr_alignment_support
6462 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6463 bool check_aligned_accesses)
6465 data_reference *dr = dr_info->dr;
6466 stmt_vec_info stmt_info = dr_info->stmt;
6467 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6468 machine_mode mode = TYPE_MODE (vectype);
6469 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6470 struct loop *vect_loop = NULL;
6471 bool nested_in_vect_loop = false;
6473 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6474 return dr_aligned;
6476 /* For now assume all conditional loads/stores support unaligned
6477 access without any special code. */
6478 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6479 if (gimple_call_internal_p (stmt)
6480 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6481 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6482 return dr_unaligned_supported;
6484 if (loop_vinfo)
6486 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6487 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6490 /* Possibly unaligned access. */
6492 /* We can choose between using the implicit realignment scheme (generating
6493 a misaligned_move stmt) and the explicit realignment scheme (generating
6494 aligned loads with a REALIGN_LOAD). There are two variants to the
6495 explicit realignment scheme: optimized, and unoptimized.
6496 We can optimize the realignment only if the step between consecutive
6497 vector loads is equal to the vector size. Since the vector memory
6498 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6499 is guaranteed that the misalignment amount remains the same throughout the
6500 execution of the vectorized loop. Therefore, we can create the
6501 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6502 at the loop preheader.
6504 However, in the case of outer-loop vectorization, when vectorizing a
6505 memory access in the inner-loop nested within the LOOP that is now being
6506 vectorized, while it is guaranteed that the misalignment of the
6507 vectorized memory access will remain the same in different outer-loop
6508 iterations, it is *not* guaranteed that it will remain the same throughout
6509 the execution of the inner-loop. This is because the inner-loop advances
6510 with the original scalar step (and not in steps of VS). If the inner-loop
6511 step happens to be a multiple of VS, then the misalignment remains fixed
6512 and we can use the optimized realignment scheme. For example:
6514 for (i=0; i<N; i++)
6515 for (j=0; j<M; j++)
6516 s += a[i+j];
6518 When vectorizing the i-loop in the above example, the step between
6519 consecutive vector loads is 1, and so the misalignment does not remain
6520 fixed across the execution of the inner-loop, and the realignment cannot
6521 be optimized (as illustrated in the following pseudo vectorized loop):
6523 for (i=0; i<N; i+=4)
6524 for (j=0; j<M; j++){
6525 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6526 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6527 // (assuming that we start from an aligned address).
6530 We therefore have to use the unoptimized realignment scheme:
6532 for (i=0; i<N; i+=4)
6533 for (j=k; j<M; j+=4)
6534 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6535 // that the misalignment of the initial address is
6536 // 0).
6538 The loop can then be vectorized as follows:
6540 for (k=0; k<4; k++){
6541 rt = get_realignment_token (&vp[k]);
6542 for (i=0; i<N; i+=4){
6543 v1 = vp[i+k];
6544 for (j=k; j<M; j+=4){
6545 v2 = vp[i+j+VS-1];
6546 va = REALIGN_LOAD <v1,v2,rt>;
6547 vs += va;
6548 v1 = v2;
6551 } */
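/* An informal summary of the result: known-aligned accesses return
   dr_aligned and IFN_MASK_LOAD/STORE return dr_unaligned_supported (both
   handled above); for other reads the explicit realignment schemes are
   considered first when the target provides vec_realign_load
   (dr_explicit_realign_optimized when the misalignment stays fixed,
   dr_explicit_realign otherwise); any remaining access returns
   dr_unaligned_supported if the target's support_vector_misalignment hook
   accepts it, and dr_unaligned_unsupported as the last resort.  */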
6553 if (DR_IS_READ (dr))
6555 bool is_packed = false;
6556 tree type = (TREE_TYPE (DR_REF (dr)));
6558 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6559 && (!targetm.vectorize.builtin_mask_for_load
6560 || targetm.vectorize.builtin_mask_for_load ()))
6562 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6564 /* If we are doing SLP then the accesses need not have the
6565 same alignment; instead it depends on the SLP group size. */
6566 if (loop_vinfo
6567 && STMT_SLP_TYPE (stmt_info)
6568 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6569 * (DR_GROUP_SIZE
6570 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6571 TYPE_VECTOR_SUBPARTS (vectype)))
6573 else if (!loop_vinfo
6574 || (nested_in_vect_loop
6575 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6576 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6577 return dr_explicit_realign;
6578 else
6579 return dr_explicit_realign_optimized;
6581 if (!known_alignment_for_access_p (dr_info))
6582 is_packed = not_size_aligned (DR_REF (dr));
6584 if (targetm.vectorize.support_vector_misalignment
6585 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6586 /* Can't software pipeline the loads, but can at least do them. */
6587 return dr_unaligned_supported;
6589 else
6591 bool is_packed = false;
6592 tree type = (TREE_TYPE (DR_REF (dr)));
6594 if (!known_alignment_for_access_p (dr_info))
6595 is_packed = not_size_aligned (DR_REF (dr));
6597 if (targetm.vectorize.support_vector_misalignment
6598 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6599 return dr_unaligned_supported;
6602 /* Unsupported. */
6603 return dr_unaligned_unsupported;