gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "params.h"
  53 #include "tree-cfg.h"
  54 #include "tree-hash-traits.h"
  55 #include "vec-perm-indices.h"
  56 #include "internal-fn.h"
  57
  58 /* Return true if load- or store-lanes optab OPTAB is implemented for
  59    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  60
  61 static bool
  62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  63                               tree vectype, unsigned HOST_WIDE_INT count)
  64 {
  65   machine_mode mode, array_mode;
  66   bool limit_p;
  67
  68   mode = TYPE_MODE (vectype);
  69   if (!targetm.array_mode (mode, count).exists (&array_mode))
  70     {
  71       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
  72       limit_p = !targetm.array_mode_supported_p (mode, count);
  73       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
  74         {
  75           if (dump_enabled_p ())
  76             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  77                              "no array mode for %s["
  78                              HOST_WIDE_INT_PRINT_DEC "]\n",
  79                              GET_MODE_NAME (mode), count);
  80           return false;
  81         }
  82     }
  83
  84   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  85     {
  86       if (dump_enabled_p ())
  87         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  88                          "cannot use %s<%s><%s>\n", name,
  89                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  90       return false;
  91     }
  92
  93   if (dump_enabled_p ())
  94     dump_printf_loc (MSG_NOTE, vect_location,
  95                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  96                      GET_MODE_NAME (mode));
  97
  98   return true;
  99 }
 100
 101
 102 /* Return the smallest scalar part of STMT.
 103    This is used to determine the vectype of the stmt.  We generally set the
 104    vectype according to the type of the result (lhs).  For stmts whose
 105    result-type is different than the type of the arguments (e.g., demotion,
 106    promotion), vectype will be reset appropriately (later).  Note that we have
 107    to visit the smallest datatype in this function, because that determines the
 108    VF.  If the smallest datatype in the loop is present only as the rhs of a
 109    promotion operation - we'd miss it.
 110    Such a case, where a variable of this datatype does not appear in the lhs
 111    anywhere in the loop, can only occur if it's an invariant: e.g.:
 112    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 113    invariant motion.  However, we cannot rely on invariant motion to always
 114    take invariants out of the loop, and so in the case of promotion we also
 115    have to check the rhs.
 116    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 117    types.  */
 118
 119 tree
 120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
 121                                HOST_WIDE_INT *rhs_size_unit)
 122 {
 123   tree scalar_type = gimple_expr_type (stmt);
 124   HOST_WIDE_INT lhs, rhs;
 125
 126   /* During the analysis phase, this function is called on arbitrary
 127      statements that might not have scalar results.  */
 128   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
 129     return scalar_type;
 130
 131   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 132
 133   if (is_gimple_assign (stmt)
 134       && (gimple_assign_cast_p (stmt)
 135           || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
 136           || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
 137           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
 138           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
 139           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
 140     {
 141       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 142
 143       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 144       if (rhs < lhs)
 145         scalar_type = rhs_type;
 146     }
 147
 148   *lhs_size_unit = lhs;
 149   *rhs_size_unit = rhs;
 150   return scalar_type;
 151 }
 152
 153
 154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 155    tested at run-time.  Return TRUE if DDR was successfully inserted.
 156    Return false if versioning is not supported.  */
 157
 158 static bool
 159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 160 {
 161   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 162
 163   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 164     return false;
 165
 166   if (!runtime_alias_check_p (ddr, loop,
 167                               optimize_loop_nest_for_speed_p (loop)))
 168     return false;
 169
 170   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 171   return true;
 172 }
 173
 174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
 175
 176 static void
 177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
 178 {
 179   vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
 180   for (unsigned int i = 0; i < checks.length(); ++i)
 181     if (checks[i] == value)
 182       return;
 183
 184   if (dump_enabled_p ())
 185     {
 186       dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
 187       dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
 188       dump_printf (MSG_NOTE, " is nonzero\n");
 189     }
 190   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
 191 }
 192
 193 /* Return true if we know that the order of vectorized STMT_A and
 194    vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
 195    At least one of the statements is a write.  */
 196
 197 static bool
 198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
 199 {
 200   stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
 201   stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
 202
 203   /* Single statements are always kept in their original order.  */
 204   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 205       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 206     return true;
 207
 208   /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
 209      group are emitted at the position of the first scalar load and all
 210      stores in a group are emitted at the position of the last scalar store.
 211      Thus writes will happen no earlier than their current position
 212      (but could happen later) while reads will happen no later than their
 213      current position (but could happen earlier).  Reordering is therefore
 214      only possible if the first access is a write.  */
 215   if (is_pattern_stmt_p (stmtinfo_a))
 216     stmt_a = STMT_VINFO_RELATED_STMT (stmtinfo_a);
 217   if (is_pattern_stmt_p (stmtinfo_b))
 218     stmt_b = STMT_VINFO_RELATED_STMT (stmtinfo_b);
 219   gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
 220   return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
 221 }
 222
 223 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 224    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 225    distances.  These distances are conservatively correct but they don't
 226    reflect a guaranteed dependence.
 227
 228    Return true if this function does all the work necessary to avoid
 229    an alias or false if the caller should use the dependence distances
 230    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 231    the depth of the loop described by LOOP_VINFO and the other arguments
 232    are as for vect_analyze_data_ref_dependence.  */
 233
 234 static bool
 235 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 236                                        loop_vec_info loop_vinfo,
 237                                        int loop_depth, unsigned int *max_vf)
 238 {
 239   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 240   lambda_vector dist_v;
 241   unsigned int i;
 242   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 243     {
 244       int dist = dist_v[loop_depth];
 245       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 246         {
 247           /* If the user asserted safelen >= DIST consecutive iterations
 248              can be executed concurrently, assume independence.
 249
 250              ??? An alternative would be to add the alias check even
 251              in this case, and vectorize the fallback loop with the
 252              maximum VF set to safelen.  However, if the user has
 253              explicitly given a length, it's less likely that that
 254              would be a win.  */
 255           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 256             {
 257               if ((unsigned int) loop->safelen < *max_vf)
 258                 *max_vf = loop->safelen;
 259               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 260               continue;
 261             }
 262
 263           /* For dependence distances of 2 or more, we have the option
 264              of limiting VF or checking for an alias at runtime.
 265              Prefer to check at runtime if we can, to avoid limiting
 266              the VF unnecessarily when the bases are in fact independent.
 267
 268              Note that the alias checks will be removed if the VF ends up
 269              being small enough.  */
 270           return (!STMT_VINFO_GATHER_SCATTER_P
 271                      (vinfo_for_stmt (DR_STMT (DDR_A (ddr))))
 272                   && !STMT_VINFO_GATHER_SCATTER_P
 273                         (vinfo_for_stmt (DR_STMT (DDR_B (ddr))))
 274                   && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
 275         }
 276     }
 277   return true;
 278 }
 279
 280
 281 /* Function vect_analyze_data_ref_dependence.
 282
 283    Return TRUE if there (might) exist a dependence between a memory-reference
 284    DRA and a memory-reference DRB.  When versioning for alias may check a
 285    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 286    the data dependence.  */
 287
 288 static bool
 289 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 290                                   loop_vec_info loop_vinfo,
 291                                   unsigned int *max_vf)
 292 {
 293   unsigned int i;
 294   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 295   struct data_reference *dra = DDR_A (ddr);
 296   struct data_reference *drb = DDR_B (ddr);
 297   stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
 298   stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
 299   lambda_vector dist_v;
 300   unsigned int loop_depth;
 301
 302   /* In loop analysis all data references should be vectorizable.  */
 303   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 304       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 305     gcc_unreachable ();
 306
 307   /* Independent data accesses.  */
 308   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 309     return false;
 310
 311   if (dra == drb
 312       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 313     return false;
 314
 315   /* We do not have to consider dependences between accesses that belong
 316      to the same group, unless the stride could be smaller than the
 317      group size.  */
 318   if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 319       && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 320           == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
 321       && !STMT_VINFO_STRIDED_P (stmtinfo_a))
 322     return false;
 323
 324   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 325      least two scalar iterations, there is always also a true dependence.
 326      As the vectorizer does not re-order loads and stores we can ignore
 327      the anti-dependence if TBAA can disambiguate both DRs similar to the
 328      case with known negative distance anti-dependences (positive
 329      distance anti-dependences would violate TBAA constraints).  */
 330   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 331        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 332       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 333                                  get_alias_set (DR_REF (drb))))
 334     return false;
 335
 336   /* Unknown data dependence.  */
 337   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 338     {
 339       /* If user asserted safelen consecutive iterations can be
 340          executed concurrently, assume independence.  */
 341       if (loop->safelen >= 2)
 342         {
 343           if ((unsigned int) loop->safelen < *max_vf)
 344             *max_vf = loop->safelen;
 345           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 346           return false;
 347         }
 348
 349       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 350           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 351         {
 352           if (dump_enabled_p ())
 353             {
 354               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 355                                "versioning for alias not supported for: "
 356                                "can't determine dependence between ");
 357               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 358                                  DR_REF (dra));
 359               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 360               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 361                                  DR_REF (drb));
 362               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 363             }
 364           return true;
 365         }
 366
 367       if (dump_enabled_p ())
 368         {
 369           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 370                            "versioning for alias required: "
 371                            "can't determine dependence between ");
 372           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 373                              DR_REF (dra));
 374           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 375           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 376                              DR_REF (drb));
 377           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 378         }
 379
 380       /* Add to list of ddrs that need to be tested at run-time.  */
 381       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 382     }
 383
 384   /* Known data dependence.  */
 385   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 386     {
 387       /* If user asserted safelen consecutive iterations can be
 388          executed concurrently, assume independence.  */
 389       if (loop->safelen >= 2)
 390         {
 391           if ((unsigned int) loop->safelen < *max_vf)
 392             *max_vf = loop->safelen;
 393           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 394           return false;
 395         }
 396
 397       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 398           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 399         {
 400           if (dump_enabled_p ())
 401             {
 402               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 403                                "versioning for alias not supported for: "
 404                                "bad dist vector for ");
 405               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 406                                  DR_REF (dra));
 407               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 408               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 409                                  DR_REF (drb));
 410               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 411             }
 412           return true;
 413         }
 414
 415       if (dump_enabled_p ())
 416         {
 417           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 418                            "versioning for alias required: "
 419                            "bad dist vector for ");
 420           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 421           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 422           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 423           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 424         }
 425       /* Add to list of ddrs that need to be tested at run-time.  */
 426       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 427     }
 428
 429   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 430
 431   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
 432       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
 433                                                 loop_depth, max_vf))
 434     return false;
 435
 436   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 437     {
 438       int dist = dist_v[loop_depth];
 439
 440       if (dump_enabled_p ())
 441         dump_printf_loc (MSG_NOTE, vect_location,
 442                          "dependence distance  = %d.\n", dist);
 443
 444       if (dist == 0)
 445         {
 446           if (dump_enabled_p ())
 447             {
 448               dump_printf_loc (MSG_NOTE, vect_location,
 449                                "dependence distance == 0 between ");
 450               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 451               dump_printf (MSG_NOTE, " and ");
 452               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 453               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 454             }
 455
 456           /* When we perform grouped accesses and perform implicit CSE
 457              by detecting equal accesses and doing disambiguation with
 458              runtime alias tests like for
 459                 .. = a[i];
 460                 .. = a[i+1];
 461                 a[i] = ..;
 462                 a[i+1] = ..;
 463                 *p = ..;
 464                 .. = a[i];
 465                 .. = a[i+1];
 466              where we will end up loading { a[i], a[i+1] } once, make
 467              sure that inserting group loads before the first load and
 468              stores after the last store will do the right thing.
 469              Similar for groups like
 470                 a[i] = ...;
 471                 ... = a[i];
 472                 a[i+1] = ...;
 473              where loads from the group interleave with the store.  */
 474           if (!vect_preserves_scalar_order_p (vect_dr_stmt(dra),
 475                                               vect_dr_stmt (drb)))
 476             {
 477               if (dump_enabled_p ())
 478                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 479                                  "READ_WRITE dependence in interleaving.\n");
 480               return true;
 481             }
 482
 483           if (loop->safelen < 2)
 484             {
 485               tree indicator = dr_zero_step_indicator (dra);
 486               if (!indicator || integer_zerop (indicator))
 487                 {
 488                   if (dump_enabled_p ())
 489                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 490                                  "access also has a zero step\n");
 491                   return true;
 492                 }
 493               else if (TREE_CODE (indicator) != INTEGER_CST)
 494                 vect_check_nonzero_value (loop_vinfo, indicator);
 495             }
 496           continue;
 497         }
 498
 499       if (dist > 0 && DDR_REVERSED_P (ddr))
 500         {
 501           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 502              reversed (to make distance vector positive), and the actual
 503              distance is negative.  */
 504           if (dump_enabled_p ())
 505             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 506                              "dependence distance negative.\n");
 507           /* Record a negative dependence distance to later limit the
 508              amount of stmt copying / unrolling we can perform.
 509              Only need to handle read-after-write dependence.  */
 510           if (DR_IS_READ (drb)
 511               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 512                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 513             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 514           continue;
 515         }
 516
 517       unsigned int abs_dist = abs (dist);
 518       if (abs_dist >= 2 && abs_dist < *max_vf)
 519         {
 520           /* The dependence distance requires reduction of the maximal
 521              vectorization factor.  */
 522           *max_vf = abs (dist);
 523           if (dump_enabled_p ())
 524             dump_printf_loc (MSG_NOTE, vect_location,
 525                              "adjusting maximal vectorization factor to %i\n",
 526                              *max_vf);
 527         }
 528
 529       if (abs_dist >= *max_vf)
 530         {
 531           /* Dependence distance does not create dependence, as far as
 532              vectorization is concerned, in this case.  */
 533           if (dump_enabled_p ())
 534             dump_printf_loc (MSG_NOTE, vect_location,
 535                              "dependence distance >= VF.\n");
 536           continue;
 537         }
 538
 539       if (dump_enabled_p ())
 540         {
 541           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 542                        "not vectorized, possible dependence "
 543                        "between data-refs ");
 544           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 545           dump_printf (MSG_NOTE,  " and ");
 546           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 547           dump_printf (MSG_NOTE,  "\n");
 548         }
 549
 550       return true;
 551     }
 552
 553   return false;
 554 }
 555
 556 /* Function vect_analyze_data_ref_dependences.
 557
 558    Examine all the data references in the loop, and make sure there do not
 559    exist any data dependences between them.  Set *MAX_VF according to
 560    the maximum vectorization factor the data dependences allow.  */
 561
 562 bool
 563 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
 564                                    unsigned int *max_vf)
 565 {
 566   unsigned int i;
 567   struct data_dependence_relation *ddr;
 568
 569   DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
 570
 571   if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
 572     {
 573       LOOP_VINFO_DDRS (loop_vinfo)
 574         .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 575                  * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 576       /* We need read-read dependences to compute
 577          STMT_VINFO_SAME_ALIGN_REFS.  */
 578       if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 579                                     &LOOP_VINFO_DDRS (loop_vinfo),
 580                                     LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
 581         return false;
 582     }
 583
 584   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 585
 586   /* For epilogues we either have no aliases or alias versioning
 587      was applied to original loop.  Therefore we may just get max_vf
 588      using VF of original loop.  */
 589   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 590     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 591   else
 592     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 593       if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
 594         return false;
 595
 596   return true;
 597 }
 598
 599
 600 /* Function vect_slp_analyze_data_ref_dependence.
 601
 602    Return TRUE if there (might) exist a dependence between a memory-reference
 603    DRA and a memory-reference DRB.  When versioning for alias may check a
 604    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 605    the data dependence.  */
 606
 607 static bool
 608 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
 609 {
 610   struct data_reference *dra = DDR_A (ddr);
 611   struct data_reference *drb = DDR_B (ddr);
 612
 613   /* We need to check dependences of statements marked as unvectorizable
 614      as well, they still can prohibit vectorization.  */
 615
 616   /* Independent data accesses.  */
 617   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 618     return false;
 619
 620   if (dra == drb)
 621     return false;
 622
 623   /* Read-read is OK.  */
 624   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 625     return false;
 626
 627   /* If dra and drb are part of the same interleaving chain consider
 628      them independent.  */
 629   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (vect_dr_stmt (dra)))
 630       && (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (dra)))
 631           == DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (drb)))))
 632     return false;
 633
 634   /* Unknown data dependence.  */
 635   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 636     {
 637       if  (dump_enabled_p ())
 638         {
 639           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 640                            "can't determine dependence between ");
 641           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 642           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 643           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 644           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 645         }
 646     }
 647   else if (dump_enabled_p ())
 648     {
 649       dump_printf_loc (MSG_NOTE, vect_location,
 650                        "determined dependence between ");
 651       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 652       dump_printf (MSG_NOTE, " and ");
 653       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 654       dump_printf (MSG_NOTE,  "\n");
 655     }
 656
 657   return true;
 658 }
 659
 660
 661 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 662    contain the vector of scalar stores of this instance if we are
 663    disambiguating the loads.  */
 664
 665 static bool
 666 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
 667                                    vec<gimple *> stores, gimple *last_store)
 668 {
 669   /* This walks over all stmts involved in the SLP load/store done
 670      in NODE verifying we can sink them up to the last stmt in the
 671      group.  */
 672   gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
 673   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 674     {
 675       gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
 676       if (access == last_access)
 677         continue;
 678       data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
 679       ao_ref ref;
 680       bool ref_initialized_p = false;
 681       for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
 682            gsi_stmt (gsi) != last_access; gsi_next (&gsi))
 683         {
 684           gimple *stmt = gsi_stmt (gsi);
 685           if (! gimple_vuse (stmt)
 686               || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
 687             continue;
 688
 689           /* If we couldn't record a (single) data reference for this
 690              stmt we have to resort to the alias oracle.  */
 691           data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
 692           if (!dr_b)
 693             {
 694               /* We are moving a store or sinking a load - this means
 695                  we cannot use TBAA for disambiguation.  */
 696               if (!ref_initialized_p)
 697                 ao_ref_init (&ref, DR_REF (dr_a));
 698               if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
 699                   || ref_maybe_used_by_stmt_p (stmt, &ref, false))
 700                 return false;
 701               continue;
 702             }
 703
 704           bool dependent = false;
 705           /* If we run into a store of this same instance (we've just
 706              marked those) then delay dependence checking until we run
 707              into the last store because this is where it will have
 708              been sunk to (and we verify if we can do that as well).  */
 709           if (gimple_visited_p (stmt))
 710             {
 711               if (stmt != last_store)
 712                 continue;
 713               unsigned i;
 714               gimple *store;
 715               FOR_EACH_VEC_ELT (stores, i, store)
 716                 {
 717                   data_reference *store_dr
 718                     = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
 719                   ddr_p ddr = initialize_data_dependence_relation
 720                                 (dr_a, store_dr, vNULL);
 721                   dependent = vect_slp_analyze_data_ref_dependence (ddr);
 722                   free_dependence_relation (ddr);
 723                   if (dependent)
 724                     break;
 725                 }
 726             }
 727           else
 728             {
 729               ddr_p ddr = initialize_data_dependence_relation (dr_a,
 730                                                                dr_b, vNULL);
 731               dependent = vect_slp_analyze_data_ref_dependence (ddr);
 732               free_dependence_relation (ddr);
 733             }
 734           if (dependent)
 735             return false;
 736         }
 737     }
 738   return true;
 739 }
 740
 741
 742 /* Function vect_analyze_data_ref_dependences.
 743
 744    Examine all the data references in the basic-block, and make sure there
 745    do not exist any data dependences between them.  Set *MAX_VF according to
 746    the maximum vectorization factor the data dependences allow.  */
 747
 748 bool
 749 vect_slp_analyze_instance_dependence (slp_instance instance)
 750 {
 751   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
 752
 753   /* The stores of this instance are at the root of the SLP tree.  */
 754   slp_tree store = SLP_INSTANCE_TREE (instance);
 755   if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
 756     store = NULL;
 757
 758   /* Verify we can sink stores to the vectorized stmt insert location.  */
 759   gimple *last_store = NULL;
 760   if (store)
 761     {
 762       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
 763         return false;
 764
 765       /* Mark stores in this instance and remember the last one.  */
 766       last_store = vect_find_last_scalar_stmt_in_slp (store);
 767       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 768         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
 769     }
 770
 771   bool res = true;
 772
 773   /* Verify we can sink loads to the vectorized stmt insert location,
 774      special-casing stores of this instance.  */
 775   slp_tree load;
 776   unsigned int i;
 777   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
 778     if (! vect_slp_analyze_node_dependences (instance, load,
 779                                              store
 780                                              ? SLP_TREE_SCALAR_STMTS (store)
 781                                              : vNULL, last_store))
 782       {
 783         res = false;
 784         break;
 785       }
 786
 787   /* Unset the visited flag.  */
 788   if (store)
 789     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 790       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
 791
 792   return res;
 793 }
 794
 795 /* Record in VINFO the base alignment guarantee given by DRB.  STMT is
 796    the statement that contains DRB, which is useful for recording in the
 797    dump file.  */
 798
 799 static void
 800 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
 801                             innermost_loop_behavior *drb)
 802 {
 803   bool existed;
 804   innermost_loop_behavior *&entry
 805     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
 806   if (!existed || entry->base_alignment < drb->base_alignment)
 807     {
 808       entry = drb;
 809       if (dump_enabled_p ())
 810         {
 811           dump_printf_loc (MSG_NOTE, vect_location,
 812                            "recording new base alignment for ");
 813           dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
 814           dump_printf (MSG_NOTE, "\n");
 815           dump_printf_loc (MSG_NOTE, vect_location,
 816                            "  alignment:    %d\n", drb->base_alignment);
 817           dump_printf_loc (MSG_NOTE, vect_location,
 818                            "  misalignment: %d\n", drb->base_misalignment);
 819           dump_printf_loc (MSG_NOTE, vect_location,
 820                            "  based on:     ");
 821           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 822         }
 823     }
 824 }
 825
 826 /* If the region we're going to vectorize is reached, all unconditional
 827    data references occur at least once.  We can therefore pool the base
 828    alignment guarantees from each unconditional reference.  Do this by
 829    going through all the data references in VINFO and checking whether
 830    the containing statement makes the reference unconditionally.  If so,
 831    record the alignment of the base address in VINFO so that it can be
 832    used for all other references with the same base.  */
 833
 834 void
 835 vect_record_base_alignments (vec_info *vinfo)
 836 {
 837   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
 838   struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
 839   data_reference *dr;
 840   unsigned int i;
 841   FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
 842     {
 843       gimple *stmt = vect_dr_stmt (dr);
 844       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 845       if (!DR_IS_CONDITIONAL_IN_STMT (dr)
 846           && STMT_VINFO_VECTORIZABLE (stmt_info)
 847           && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
 848         {
 849           vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
 850
 851           /* If DR is nested in the loop that is being vectorized, we can also
 852              record the alignment of the base wrt the outer loop.  */
 853           if (loop && nested_in_vect_loop_p (loop, stmt))
 854             vect_record_base_alignment
 855                 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
 856         }
 857     }
 858 }
 859
 860 /* Return the target alignment for the vectorized form of DR.  */
 861
 862 static unsigned int
 863 vect_calculate_target_alignment (struct data_reference *dr)
 864 {
 865   gimple *stmt = vect_dr_stmt (dr);
 866   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 867   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 868   return targetm.vectorize.preferred_vector_alignment (vectype);
 869 }
 870
 871 /* Function vect_compute_data_ref_alignment
 872
 873    Compute the misalignment of the data reference DR.
 874
 875    Output:
 876    1. DR_MISALIGNMENT (DR) is defined.
 877
 878    FOR NOW: No analysis is actually performed. Misalignment is calculated
 879    only for trivial cases. TODO.  */
 880
 881 static void
 882 vect_compute_data_ref_alignment (struct data_reference *dr)
 883 {
 884   gimple *stmt = vect_dr_stmt (dr);
 885   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 886   vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
 887   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 888   struct loop *loop = NULL;
 889   tree ref = DR_REF (dr);
 890   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 891
 892   if (dump_enabled_p ())
 893     dump_printf_loc (MSG_NOTE, vect_location,
 894                      "vect_compute_data_ref_alignment:\n");
 895
 896   if (loop_vinfo)
 897     loop = LOOP_VINFO_LOOP (loop_vinfo);
 898
 899   /* Initialize misalignment to unknown.  */
 900   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
 901
 902   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
 903     return;
 904
 905   innermost_loop_behavior *drb = vect_dr_behavior (dr);
 906   bool step_preserves_misalignment_p;
 907
 908   unsigned HOST_WIDE_INT vector_alignment
 909     = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
 910   DR_TARGET_ALIGNMENT (dr) = vector_alignment;
 911
 912   /* No step for BB vectorization.  */
 913   if (!loop)
 914     {
 915       gcc_assert (integer_zerop (drb->step));
 916       step_preserves_misalignment_p = true;
 917     }
 918
 919   /* In case the dataref is in an inner-loop of the loop that is being
 920      vectorized (LOOP), we use the base and misalignment information
 921      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 922      stays the same throughout the execution of the inner-loop, which is why
 923      we have to check that the stride of the dataref in the inner-loop evenly
 924      divides by the vector alignment.  */
 925   else if (nested_in_vect_loop_p (loop, stmt))
 926     {
 927       step_preserves_misalignment_p
 928         = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
 929
 930       if (dump_enabled_p ())
 931         {
 932           if (step_preserves_misalignment_p)
 933             dump_printf_loc (MSG_NOTE, vect_location,
 934                              "inner step divides the vector alignment.\n");
 935           else
 936             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 937                              "inner step doesn't divide the vector"
 938                              " alignment.\n");
 939         }
 940     }
 941
 942   /* Similarly we can only use base and misalignment information relative to
 943      an innermost loop if the misalignment stays the same throughout the
 944      execution of the loop.  As above, this is the case if the stride of
 945      the dataref evenly divides by the alignment.  */
 946   else
 947     {
 948       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 949       step_preserves_misalignment_p
 950         = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
 951
 952       if (!step_preserves_misalignment_p && dump_enabled_p ())
 953         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 954                          "step doesn't divide the vector alignment.\n");
 955     }
 956
 957   unsigned int base_alignment = drb->base_alignment;
 958   unsigned int base_misalignment = drb->base_misalignment;
 959
 960   /* Calculate the maximum of the pooled base address alignment and the
 961      alignment that we can compute for DR itself.  */
 962   innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
 963   if (entry && base_alignment < (*entry)->base_alignment)
 964     {
 965       base_alignment = (*entry)->base_alignment;
 966       base_misalignment = (*entry)->base_misalignment;
 967     }
 968
 969   if (drb->offset_alignment < vector_alignment
 970       || !step_preserves_misalignment_p
 971       /* We need to know whether the step wrt the vectorized loop is
 972          negative when computing the starting misalignment below.  */
 973       || TREE_CODE (drb->step) != INTEGER_CST)
 974     {
 975       if (dump_enabled_p ())
 976         {
 977           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 978                            "Unknown alignment for access: ");
 979           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 980           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 981         }
 982       return;
 983     }
 984
 985   if (base_alignment < vector_alignment)
 986     {
 987       unsigned int max_alignment;
 988       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
 989       if (max_alignment < vector_alignment
 990           || !vect_can_force_dr_alignment_p (base,
 991                                              vector_alignment * BITS_PER_UNIT))
 992         {
 993           if (dump_enabled_p ())
 994             {
 995               dump_printf_loc (MSG_NOTE, vect_location,
 996                                "can't force alignment of ref: ");
 997               dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 998               dump_printf (MSG_NOTE, "\n");
 999             }
1000           return;
1001         }
1002
1003       /* Force the alignment of the decl.
1004          NOTE: This is the only change to the code we make during
1005          the analysis phase, before deciding to vectorize the loop.  */
1006       if (dump_enabled_p ())
1007         {
1008           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
1009           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1010           dump_printf (MSG_NOTE, "\n");
1011         }
1012
1013       DR_VECT_AUX (dr)->base_decl = base;
1014       DR_VECT_AUX (dr)->base_misaligned = true;
1015       base_misalignment = 0;
1016     }
1017   poly_int64 misalignment
1018     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1019
1020   /* If this is a backward running DR then first access in the larger
1021      vectype actually is N-1 elements before the address in the DR.
1022      Adjust misalign accordingly.  */
1023   if (tree_int_cst_sgn (drb->step) < 0)
1024     /* PLUS because STEP is negative.  */
1025     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1026                      * TREE_INT_CST_LOW (drb->step));
1027
1028   unsigned int const_misalignment;
1029   if (!known_misalignment (misalignment, vector_alignment,
1030                            &const_misalignment))
1031     {
1032       if (dump_enabled_p ())
1033         {
1034           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1035                            "Non-constant misalignment for access: ");
1036           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1037           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1038         }
1039       return;
1040     }
1041
1042   SET_DR_MISALIGNMENT (dr, const_misalignment);
1043
1044   if (dump_enabled_p ())
1045     {
1046       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1047                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1048       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1049       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1050     }
1051
1052   return;
1053 }
1054
1055 /* Function vect_update_misalignment_for_peel.
1056    Sets DR's misalignment
1057    - to 0 if it has the same alignment as DR_PEEL,
1058    - to the misalignment computed using NPEEL if DR's salignment is known,
1059    - to -1 (unknown) otherwise.
1060
1061    DR - the data reference whose misalignment is to be adjusted.
1062    DR_PEEL - the data reference whose misalignment is being made
1063              zero in the vector loop by the peel.
1064    NPEEL - the number of iterations in the peel loop if the misalignment
1065            of DR_PEEL is known at compile time.  */
1066
1067 static void
1068 vect_update_misalignment_for_peel (struct data_reference *dr,
1069                                    struct data_reference *dr_peel, int npeel)
1070 {
1071   unsigned int i;
1072   vec<dr_p> same_aligned_drs;
1073   struct data_reference *current_dr;
1074   int dr_size = vect_get_scalar_dr_size (dr);
1075   int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1076   stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
1077   stmt_vec_info peel_stmt_info = vinfo_for_stmt (vect_dr_stmt (dr_peel));
1078
1079  /* For interleaved data accesses the step in the loop must be multiplied by
1080      the size of the interleaving group.  */
1081   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1082     dr_size *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
1083   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1084     dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1085
1086   /* It can be assumed that the data refs with the same alignment as dr_peel
1087      are aligned in the vector loop.  */
1088   same_aligned_drs
1089     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (vect_dr_stmt (dr_peel)));
1090   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1091     {
1092       if (current_dr != dr)
1093         continue;
1094       gcc_assert (!known_alignment_for_access_p (dr)
1095                   || !known_alignment_for_access_p (dr_peel)
1096                   || (DR_MISALIGNMENT (dr) / dr_size
1097                       == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1098       SET_DR_MISALIGNMENT (dr, 0);
1099       return;
1100     }
1101
1102   if (known_alignment_for_access_p (dr)
1103       && known_alignment_for_access_p (dr_peel))
1104     {
1105       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1106       int misal = DR_MISALIGNMENT (dr);
1107       misal += negative ? -npeel * dr_size : npeel * dr_size;
1108       misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1109       SET_DR_MISALIGNMENT (dr, misal);
1110       return;
1111     }
1112
1113   if (dump_enabled_p ())
1114     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1115                      "to unknown (-1).\n");
1116   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1117 }
1118
1119
1120 /* Function verify_data_ref_alignment
1121
1122    Return TRUE if DR can be handled with respect to alignment.  */
1123
1124 static bool
1125 verify_data_ref_alignment (data_reference_p dr)
1126 {
1127   enum dr_alignment_support supportable_dr_alignment
1128     = vect_supportable_dr_alignment (dr, false);
1129   if (!supportable_dr_alignment)
1130     {
1131       if (dump_enabled_p ())
1132         {
1133           if (DR_IS_READ (dr))
1134             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1135                              "not vectorized: unsupported unaligned load.");
1136           else
1137             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1138                              "not vectorized: unsupported unaligned "
1139                              "store.");
1140
1141           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1142                              DR_REF (dr));
1143           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1144         }
1145       return false;
1146     }
1147
1148   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1149     dump_printf_loc (MSG_NOTE, vect_location,
1150                      "Vectorizing an unaligned access.\n");
1151
1152   return true;
1153 }
1154
1155 /* Function vect_verify_datarefs_alignment
1156
1157    Return TRUE if all data references in the loop can be
1158    handled with respect to alignment.  */
1159
1160 bool
1161 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1162 {
1163   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1164   struct data_reference *dr;
1165   unsigned int i;
1166
1167   FOR_EACH_VEC_ELT (datarefs, i, dr)
1168     {
1169       gimple *stmt = vect_dr_stmt (dr);
1170       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1171
1172       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1173         continue;
1174
1175       /* For interleaving, only the alignment of the first access matters.   */
1176       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1177           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1178         continue;
1179
1180       /* Strided accesses perform only component accesses, alignment is
1181          irrelevant for them.  */
1182       if (STMT_VINFO_STRIDED_P (stmt_info)
1183           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1184         continue;
1185
1186       if (! verify_data_ref_alignment (dr))
1187         return false;
1188     }
1189
1190   return true;
1191 }
1192
1193 /* Given an memory reference EXP return whether its alignment is less
1194    than its size.  */
1195
1196 static bool
1197 not_size_aligned (tree exp)
1198 {
1199   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1200     return true;
1201
1202   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1203           > get_object_alignment (exp));
1204 }
1205
1206 /* Function vector_alignment_reachable_p
1207
1208    Return true if vector alignment for DR is reachable by peeling
1209    a few loop iterations.  Return false otherwise.  */
1210
1211 static bool
1212 vector_alignment_reachable_p (struct data_reference *dr)
1213 {
1214   gimple *stmt = vect_dr_stmt (dr);
1215   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1216   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1217
1218   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1219     {
1220       /* For interleaved access we peel only if number of iterations in
1221          the prolog loop ({VF - misalignment}), is a multiple of the
1222          number of the interleaved accesses.  */
1223       int elem_size, mis_in_elements;
1224
1225       /* FORNOW: handle only known alignment.  */
1226       if (!known_alignment_for_access_p (dr))
1227         return false;
1228
1229       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1230       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1231       elem_size = vector_element_size (vector_size, nelements);
1232       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1233
1234       if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1235         return false;
1236     }
1237
1238   /* If misalignment is known at the compile time then allow peeling
1239      only if natural alignment is reachable through peeling.  */
1240   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1241     {
1242       HOST_WIDE_INT elmsize =
1243                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1244       if (dump_enabled_p ())
1245         {
1246           dump_printf_loc (MSG_NOTE, vect_location,
1247                            "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1248           dump_printf (MSG_NOTE,
1249                        ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1250         }
1251       if (DR_MISALIGNMENT (dr) % elmsize)
1252         {
1253           if (dump_enabled_p ())
1254             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1255                              "data size does not divide the misalignment.\n");
1256           return false;
1257         }
1258     }
1259
1260   if (!known_alignment_for_access_p (dr))
1261     {
1262       tree type = TREE_TYPE (DR_REF (dr));
1263       bool is_packed = not_size_aligned (DR_REF (dr));
1264       if (dump_enabled_p ())
1265         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1266                          "Unknown misalignment, %snaturally aligned\n",
1267                          is_packed ? "not " : "");
1268       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1269     }
1270
1271   return true;
1272 }
1273
1274
1275 /* Calculate the cost of the memory access represented by DR.  */
1276
1277 static void
1278 vect_get_data_access_cost (struct data_reference *dr,
1279                            unsigned int *inside_cost,
1280                            unsigned int *outside_cost,
1281                            stmt_vector_for_cost *body_cost_vec,
1282                            stmt_vector_for_cost *prologue_cost_vec)
1283 {
1284   gimple *stmt = vect_dr_stmt (dr);
1285   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1286   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1287   int ncopies;
1288
1289   if (PURE_SLP_STMT (stmt_info))
1290     ncopies = 1;
1291   else
1292     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1293
1294   if (DR_IS_READ (dr))
1295     vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1296                         prologue_cost_vec, body_cost_vec, false);
1297   else
1298     vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1299
1300   if (dump_enabled_p ())
1301     dump_printf_loc (MSG_NOTE, vect_location,
1302                      "vect_get_data_access_cost: inside_cost = %d, "
1303                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1304 }
1305
1306
1307 typedef struct _vect_peel_info
1308 {
1309   struct data_reference *dr;
1310   int npeel;
1311   unsigned int count;
1312 } *vect_peel_info;
1313
1314 typedef struct _vect_peel_extended_info
1315 {
1316   struct _vect_peel_info peel_info;
1317   unsigned int inside_cost;
1318   unsigned int outside_cost;
1319 } *vect_peel_extended_info;
1320
1321
1322 /* Peeling hashtable helpers.  */
1323
1324 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1325 {
1326   static inline hashval_t hash (const _vect_peel_info *);
1327   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1328 };
1329
1330 inline hashval_t
1331 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1332 {
1333   return (hashval_t) peel_info->npeel;
1334 }
1335
1336 inline bool
1337 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1338 {
1339   return (a->npeel == b->npeel);
1340 }
1341
1342
1343 /* Insert DR into peeling hash table with NPEEL as key.  */
1344
1345 static void
1346 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1347                           loop_vec_info loop_vinfo, struct data_reference *dr,
1348                           int npeel)
1349 {
1350   struct _vect_peel_info elem, *slot;
1351   _vect_peel_info **new_slot;
1352   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1353
1354   elem.npeel = npeel;
1355   slot = peeling_htab->find (&elem);
1356   if (slot)
1357     slot->count++;
1358   else
1359     {
1360       slot = XNEW (struct _vect_peel_info);
1361       slot->npeel = npeel;
1362       slot->dr = dr;
1363       slot->count = 1;
1364       new_slot = peeling_htab->find_slot (slot, INSERT);
1365       *new_slot = slot;
1366     }
1367
1368   if (!supportable_dr_alignment
1369       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1370     slot->count += VECT_MAX_COST;
1371 }
1372
1373
1374 /* Traverse peeling hash table to find peeling option that aligns maximum
1375    number of data accesses.  */
1376
1377 int
1378 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1379                                      _vect_peel_extended_info *max)
1380 {
1381   vect_peel_info elem = *slot;
1382
1383   if (elem->count > max->peel_info.count
1384       || (elem->count == max->peel_info.count
1385           && max->peel_info.npeel > elem->npeel))
1386     {
1387       max->peel_info.npeel = elem->npeel;
1388       max->peel_info.count = elem->count;
1389       max->peel_info.dr = elem->dr;
1390     }
1391
1392   return 1;
1393 }
1394
1395 /* Get the costs of peeling NPEEL iterations checking data access costs
1396    for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1397    misalignment will be zero after peeling.  */
1398
1399 static void
1400 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1401                                 struct data_reference *dr0,
1402                                 unsigned int *inside_cost,
1403                                 unsigned int *outside_cost,
1404                                 stmt_vector_for_cost *body_cost_vec,
1405                                 stmt_vector_for_cost *prologue_cost_vec,
1406                                 unsigned int npeel,
1407                                 bool unknown_misalignment)
1408 {
1409   unsigned i;
1410   data_reference *dr;
1411
1412   FOR_EACH_VEC_ELT (datarefs, i, dr)
1413     {
1414       gimple *stmt = vect_dr_stmt (dr);
1415       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1416       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1417         continue;
1418
1419       /* For interleaving, only the alignment of the first access
1420          matters.  */
1421       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1422           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1423         continue;
1424
1425       /* Strided accesses perform only component accesses, alignment is
1426          irrelevant for them.  */
1427       if (STMT_VINFO_STRIDED_P (stmt_info)
1428           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1429         continue;
1430
1431       int save_misalignment;
1432       save_misalignment = DR_MISALIGNMENT (dr);
1433       if (npeel == 0)
1434         ;
1435       else if (unknown_misalignment && dr == dr0)
1436         SET_DR_MISALIGNMENT (dr, 0);
1437       else
1438         vect_update_misalignment_for_peel (dr, dr0, npeel);
1439       vect_get_data_access_cost (dr, inside_cost, outside_cost,
1440                                  body_cost_vec, prologue_cost_vec);
1441       SET_DR_MISALIGNMENT (dr, save_misalignment);
1442     }
1443 }
1444
1445 /* Traverse peeling hash table and calculate cost for each peeling option.
1446    Find the one with the lowest cost.  */
1447
1448 int
1449 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1450                                    _vect_peel_extended_info *min)
1451 {
1452   vect_peel_info elem = *slot;
1453   int dummy;
1454   unsigned int inside_cost = 0, outside_cost = 0;
1455   gimple *stmt = vect_dr_stmt (elem->dr);
1456   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1457   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1458   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1459                        epilogue_cost_vec;
1460
1461   prologue_cost_vec.create (2);
1462   body_cost_vec.create (2);
1463   epilogue_cost_vec.create (2);
1464
1465   vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1466                                   elem->dr, &inside_cost, &outside_cost,
1467                                   &body_cost_vec, &prologue_cost_vec,
1468                                   elem->npeel, false);
1469
1470   body_cost_vec.release ();
1471
1472   outside_cost += vect_get_known_peeling_cost
1473     (loop_vinfo, elem->npeel, &dummy,
1474      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1475      &prologue_cost_vec, &epilogue_cost_vec);
1476
1477   /* Prologue and epilogue costs are added to the target model later.
1478      These costs depend only on the scalar iteration cost, the
1479      number of peeling iterations finally chosen, and the number of
1480      misaligned statements.  So discard the information found here.  */
1481   prologue_cost_vec.release ();
1482   epilogue_cost_vec.release ();
1483
1484   if (inside_cost < min->inside_cost
1485       || (inside_cost == min->inside_cost
1486           && outside_cost < min->outside_cost))
1487     {
1488       min->inside_cost = inside_cost;
1489       min->outside_cost = outside_cost;
1490       min->peel_info.dr = elem->dr;
1491       min->peel_info.npeel = elem->npeel;
1492       min->peel_info.count = elem->count;
1493     }
1494
1495   return 1;
1496 }
1497
1498
1499 /* Choose best peeling option by traversing peeling hash table and either
1500    choosing an option with the lowest cost (if cost model is enabled) or the
1501    option that aligns as many accesses as possible.  */
1502
1503 static struct _vect_peel_extended_info
1504 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1505                                        loop_vec_info loop_vinfo)
1506 {
1507    struct _vect_peel_extended_info res;
1508
1509    res.peel_info.dr = NULL;
1510
1511    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1512      {
1513        res.inside_cost = INT_MAX;
1514        res.outside_cost = INT_MAX;
1515        peeling_htab->traverse <_vect_peel_extended_info *,
1516                                vect_peeling_hash_get_lowest_cost> (&res);
1517      }
1518    else
1519      {
1520        res.peel_info.count = 0;
1521        peeling_htab->traverse <_vect_peel_extended_info *,
1522                                vect_peeling_hash_get_most_frequent> (&res);
1523        res.inside_cost = 0;
1524        res.outside_cost = 0;
1525      }
1526
1527    return res;
1528 }
1529
1530 /* Return true if the new peeling NPEEL is supported.  */
1531
1532 static bool
1533 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1534                           unsigned npeel)
1535 {
1536   unsigned i;
1537   struct data_reference *dr = NULL;
1538   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1539   gimple *stmt;
1540   stmt_vec_info stmt_info;
1541   enum dr_alignment_support supportable_dr_alignment;
1542
1543   /* Ensure that all data refs can be vectorized after the peel.  */
1544   FOR_EACH_VEC_ELT (datarefs, i, dr)
1545     {
1546       int save_misalignment;
1547
1548       if (dr == dr0)
1549         continue;
1550
1551       stmt = vect_dr_stmt (dr);
1552       stmt_info = vinfo_for_stmt (stmt);
1553       /* For interleaving, only the alignment of the first access
1554          matters.  */
1555       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1556           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1557         continue;
1558
1559       /* Strided accesses perform only component accesses, alignment is
1560          irrelevant for them.  */
1561       if (STMT_VINFO_STRIDED_P (stmt_info)
1562           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1563         continue;
1564
1565       save_misalignment = DR_MISALIGNMENT (dr);
1566       vect_update_misalignment_for_peel (dr, dr0, npeel);
1567       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1568       SET_DR_MISALIGNMENT (dr, save_misalignment);
1569
1570       if (!supportable_dr_alignment)
1571         return false;
1572     }
1573
1574   return true;
1575 }
1576
1577 /* Function vect_enhance_data_refs_alignment
1578
1579    This pass will use loop versioning and loop peeling in order to enhance
1580    the alignment of data references in the loop.
1581
1582    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1583    original loop is to be vectorized.  Any other loops that are created by
1584    the transformations performed in this pass - are not supposed to be
1585    vectorized.  This restriction will be relaxed.
1586
1587    This pass will require a cost model to guide it whether to apply peeling
1588    or versioning or a combination of the two.  For example, the scheme that
1589    intel uses when given a loop with several memory accesses, is as follows:
1590    choose one memory access ('p') which alignment you want to force by doing
1591    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1592    other accesses are not necessarily aligned, or (2) use loop versioning to
1593    generate one loop in which all accesses are aligned, and another loop in
1594    which only 'p' is necessarily aligned.
1595
1596    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1597    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1598    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1599
1600    Devising a cost model is the most critical aspect of this work.  It will
1601    guide us on which access to peel for, whether to use loop versioning, how
1602    many versions to create, etc.  The cost model will probably consist of
1603    generic considerations as well as target specific considerations (on
1604    powerpc for example, misaligned stores are more painful than misaligned
1605    loads).
1606
1607    Here are the general steps involved in alignment enhancements:
1608
1609      -- original loop, before alignment analysis:
1610         for (i=0; i<N; i++){
1611           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1612           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1613         }
1614
1615      -- After vect_compute_data_refs_alignment:
1616         for (i=0; i<N; i++){
1617           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1618           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1619         }
1620
1621      -- Possibility 1: we do loop versioning:
1622      if (p is aligned) {
1623         for (i=0; i<N; i++){    # loop 1A
1624           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1625           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1626         }
1627      }
1628      else {
1629         for (i=0; i<N; i++){    # loop 1B
1630           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1631           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1632         }
1633      }
1634
1635      -- Possibility 2: we do loop peeling:
1636      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1637         x = q[i];
1638         p[i] = y;
1639      }
1640      for (i = 3; i < N; i++){   # loop 2A
1641         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1642         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1643      }
1644
1645      -- Possibility 3: combination of loop peeling and versioning:
1646      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1647         x = q[i];
1648         p[i] = y;
1649      }
1650      if (p is aligned) {
1651         for (i = 3; i<N; i++){  # loop 3A
1652           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1653           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1654         }
1655      }
1656      else {
1657         for (i = 3; i<N; i++){  # loop 3B
1658           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1659           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1660         }
1661      }
1662
1663      These loops are later passed to loop_transform to be vectorized.  The
1664      vectorizer will use the alignment information to guide the transformation
1665      (whether to generate regular loads/stores, or with special handling for
1666      misalignment).  */
1667
1668 bool
1669 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1670 {
1671   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1672   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1673   enum dr_alignment_support supportable_dr_alignment;
1674   struct data_reference *dr0 = NULL, *first_store = NULL;
1675   struct data_reference *dr;
1676   unsigned int i, j;
1677   bool do_peeling = false;
1678   bool do_versioning = false;
1679   bool stat;
1680   gimple *stmt;
1681   stmt_vec_info stmt_info;
1682   unsigned int npeel = 0;
1683   bool one_misalignment_known = false;
1684   bool one_misalignment_unknown = false;
1685   bool one_dr_unsupportable = false;
1686   struct data_reference *unsupportable_dr = NULL;
1687   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1688   unsigned possible_npeel_number = 1;
1689   tree vectype;
1690   unsigned int mis, same_align_drs_max = 0;
1691   hash_table<peel_info_hasher> peeling_htab (1);
1692
1693   DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1694
1695   /* Reset data so we can safely be called multiple times.  */
1696   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1697   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1698
1699   /* While cost model enhancements are expected in the future, the high level
1700      view of the code at this time is as follows:
1701
1702      A) If there is a misaligned access then see if peeling to align
1703         this access can make all data references satisfy
1704         vect_supportable_dr_alignment.  If so, update data structures
1705         as needed and return true.
1706
1707      B) If peeling wasn't possible and there is a data reference with an
1708         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1709         then see if loop versioning checks can be used to make all data
1710         references satisfy vect_supportable_dr_alignment.  If so, update
1711         data structures as needed and return true.
1712
1713      C) If neither peeling nor versioning were successful then return false if
1714         any data reference does not satisfy vect_supportable_dr_alignment.
1715
1716      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1717
1718      Note, Possibility 3 above (which is peeling and versioning together) is not
1719      being done at this time.  */
1720
1721   /* (1) Peeling to force alignment.  */
1722
1723   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1724      Considerations:
1725      + How many accesses will become aligned due to the peeling
1726      - How many accesses will become unaligned due to the peeling,
1727        and the cost of misaligned accesses.
1728      - The cost of peeling (the extra runtime checks, the increase
1729        in code size).  */
1730
1731   FOR_EACH_VEC_ELT (datarefs, i, dr)
1732     {
1733       stmt = vect_dr_stmt (dr);
1734       stmt_info = vinfo_for_stmt (stmt);
1735
1736       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1737         continue;
1738
1739       /* For interleaving, only the alignment of the first access
1740          matters.  */
1741       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1742           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1743         continue;
1744
1745       /* For scatter-gather or invariant accesses there is nothing
1746          to enhance.  */
1747       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1748           || integer_zerop (DR_STEP (dr)))
1749         continue;
1750
1751       /* Strided accesses perform only component accesses, alignment is
1752          irrelevant for them.  */
1753       if (STMT_VINFO_STRIDED_P (stmt_info)
1754           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1755         continue;
1756
1757       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1758       do_peeling = vector_alignment_reachable_p (dr);
1759       if (do_peeling)
1760         {
1761           if (known_alignment_for_access_p (dr))
1762             {
1763               unsigned int npeel_tmp = 0;
1764               bool negative = tree_int_cst_compare (DR_STEP (dr),
1765                                                     size_zero_node) < 0;
1766
1767               vectype = STMT_VINFO_VECTYPE (stmt_info);
1768               unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1769               unsigned int dr_size = vect_get_scalar_dr_size (dr);
1770               mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1771               if (DR_MISALIGNMENT (dr) != 0)
1772                 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1773
1774               /* For multiple types, it is possible that the bigger type access
1775                  will have more than one peeling option.  E.g., a loop with two
1776                  types: one of size (vector size / 4), and the other one of
1777                  size (vector size / 8).  Vectorization factor will 8.  If both
1778                  accesses are misaligned by 3, the first one needs one scalar
1779                  iteration to be aligned, and the second one needs 5.  But the
1780                  first one will be aligned also by peeling 5 scalar
1781                  iterations, and in that case both accesses will be aligned.
1782                  Hence, except for the immediate peeling amount, we also want
1783                  to try to add full vector size, while we don't exceed
1784                  vectorization factor.
1785                  We do this automatically for cost model, since we calculate
1786                  cost for every peeling option.  */
1787               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1788                 {
1789                   poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1790                                           ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1791                   possible_npeel_number
1792                     = vect_get_num_vectors (nscalars, vectype);
1793
1794                   /* NPEEL_TMP is 0 when there is no misalignment, but also
1795                      allow peeling NELEMENTS.  */
1796                   if (DR_MISALIGNMENT (dr) == 0)
1797                     possible_npeel_number++;
1798                 }
1799
1800               /* Save info about DR in the hash table.  Also include peeling
1801                  amounts according to the explanation above.  */
1802               for (j = 0; j < possible_npeel_number; j++)
1803                 {
1804                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1805                                             dr, npeel_tmp);
1806                   npeel_tmp += target_align / dr_size;
1807                 }
1808
1809               one_misalignment_known = true;
1810             }
1811           else
1812             {
1813               /* If we don't know any misalignment values, we prefer
1814                  peeling for data-ref that has the maximum number of data-refs
1815                  with the same alignment, unless the target prefers to align
1816                  stores over load.  */
1817               unsigned same_align_drs
1818                 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1819               if (!dr0
1820                   || same_align_drs_max < same_align_drs)
1821                 {
1822                   same_align_drs_max = same_align_drs;
1823                   dr0 = dr;
1824                 }
1825               /* For data-refs with the same number of related
1826                  accesses prefer the one where the misalign
1827                  computation will be invariant in the outermost loop.  */
1828               else if (same_align_drs_max == same_align_drs)
1829                 {
1830                   struct loop *ivloop0, *ivloop;
1831                   ivloop0 = outermost_invariant_loop_for_expr
1832                     (loop, DR_BASE_ADDRESS (dr0));
1833                   ivloop = outermost_invariant_loop_for_expr
1834                     (loop, DR_BASE_ADDRESS (dr));
1835                   if ((ivloop && !ivloop0)
1836                       || (ivloop && ivloop0
1837                           && flow_loop_nested_p (ivloop, ivloop0)))
1838                     dr0 = dr;
1839                 }
1840
1841               one_misalignment_unknown = true;
1842
1843               /* Check for data refs with unsupportable alignment that
1844                  can be peeled.  */
1845               if (!supportable_dr_alignment)
1846               {
1847                 one_dr_unsupportable = true;
1848                 unsupportable_dr = dr;
1849               }
1850
1851               if (!first_store && DR_IS_WRITE (dr))
1852                 first_store = dr;
1853             }
1854         }
1855       else
1856         {
1857           if (!aligned_access_p (dr))
1858             {
1859               if (dump_enabled_p ())
1860                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1861                                  "vector alignment may not be reachable\n");
1862               break;
1863             }
1864         }
1865     }
1866
1867   /* Check if we can possibly peel the loop.  */
1868   if (!vect_can_advance_ivs_p (loop_vinfo)
1869       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1870       || loop->inner)
1871     do_peeling = false;
1872
1873   struct _vect_peel_extended_info peel_for_known_alignment;
1874   struct _vect_peel_extended_info peel_for_unknown_alignment;
1875   struct _vect_peel_extended_info best_peel;
1876
1877   peel_for_unknown_alignment.inside_cost = INT_MAX;
1878   peel_for_unknown_alignment.outside_cost = INT_MAX;
1879   peel_for_unknown_alignment.peel_info.count = 0;
1880
1881   if (do_peeling
1882       && one_misalignment_unknown)
1883     {
1884       /* Check if the target requires to prefer stores over loads, i.e., if
1885          misaligned stores are more expensive than misaligned loads (taking
1886          drs with same alignment into account).  */
1887       unsigned int load_inside_cost = 0;
1888       unsigned int load_outside_cost = 0;
1889       unsigned int store_inside_cost = 0;
1890       unsigned int store_outside_cost = 0;
1891       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1892
1893       stmt_vector_for_cost dummy;
1894       dummy.create (2);
1895       vect_get_peeling_costs_all_drs (datarefs, dr0,
1896                                       &load_inside_cost,
1897                                       &load_outside_cost,
1898                                       &dummy, &dummy, estimated_npeels, true);
1899       dummy.release ();
1900
1901       if (first_store)
1902         {
1903           dummy.create (2);
1904           vect_get_peeling_costs_all_drs (datarefs, first_store,
1905                                           &store_inside_cost,
1906                                           &store_outside_cost,
1907                                           &dummy, &dummy,
1908                                           estimated_npeels, true);
1909           dummy.release ();
1910         }
1911       else
1912         {
1913           store_inside_cost = INT_MAX;
1914           store_outside_cost = INT_MAX;
1915         }
1916
1917       if (load_inside_cost > store_inside_cost
1918           || (load_inside_cost == store_inside_cost
1919               && load_outside_cost > store_outside_cost))
1920         {
1921           dr0 = first_store;
1922           peel_for_unknown_alignment.inside_cost = store_inside_cost;
1923           peel_for_unknown_alignment.outside_cost = store_outside_cost;
1924         }
1925       else
1926         {
1927           peel_for_unknown_alignment.inside_cost = load_inside_cost;
1928           peel_for_unknown_alignment.outside_cost = load_outside_cost;
1929         }
1930
1931       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1932       prologue_cost_vec.create (2);
1933       epilogue_cost_vec.create (2);
1934
1935       int dummy2;
1936       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1937         (loop_vinfo, estimated_npeels, &dummy2,
1938          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1939          &prologue_cost_vec, &epilogue_cost_vec);
1940
1941       prologue_cost_vec.release ();
1942       epilogue_cost_vec.release ();
1943
1944       peel_for_unknown_alignment.peel_info.count = 1
1945         + STMT_VINFO_SAME_ALIGN_REFS
1946         (vinfo_for_stmt (vect_dr_stmt (dr0))).length ();
1947     }
1948
1949   peel_for_unknown_alignment.peel_info.npeel = 0;
1950   peel_for_unknown_alignment.peel_info.dr = dr0;
1951
1952   best_peel = peel_for_unknown_alignment;
1953
1954   peel_for_known_alignment.inside_cost = INT_MAX;
1955   peel_for_known_alignment.outside_cost = INT_MAX;
1956   peel_for_known_alignment.peel_info.count = 0;
1957   peel_for_known_alignment.peel_info.dr = NULL;
1958
1959   if (do_peeling && one_misalignment_known)
1960     {
1961       /* Peeling is possible, but there is no data access that is not supported
1962          unless aligned.  So we try to choose the best possible peeling from
1963          the hash table.  */
1964       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1965         (&peeling_htab, loop_vinfo);
1966     }
1967
1968   /* Compare costs of peeling for known and unknown alignment. */
1969   if (peel_for_known_alignment.peel_info.dr != NULL
1970       && peel_for_unknown_alignment.inside_cost
1971       >= peel_for_known_alignment.inside_cost)
1972     {
1973       best_peel = peel_for_known_alignment;
1974
1975       /* If the best peeling for known alignment has NPEEL == 0, perform no
1976          peeling at all except if there is an unsupportable dr that we can
1977          align.  */
1978       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1979         do_peeling = false;
1980     }
1981
1982   /* If there is an unsupportable data ref, prefer this over all choices so far
1983      since we'd have to discard a chosen peeling except when it accidentally
1984      aligned the unsupportable data ref.  */
1985   if (one_dr_unsupportable)
1986     dr0 = unsupportable_dr;
1987   else if (do_peeling)
1988     {
1989       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1990          TODO: Use nopeel_outside_cost or get rid of it?  */
1991       unsigned nopeel_inside_cost = 0;
1992       unsigned nopeel_outside_cost = 0;
1993
1994       stmt_vector_for_cost dummy;
1995       dummy.create (2);
1996       vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1997                                       &nopeel_outside_cost, &dummy, &dummy,
1998                                       0, false);
1999       dummy.release ();
2000
2001       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
2002          costs will be recorded.  */
2003       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2004       prologue_cost_vec.create (2);
2005       epilogue_cost_vec.create (2);
2006
2007       int dummy2;
2008       nopeel_outside_cost += vect_get_known_peeling_cost
2009         (loop_vinfo, 0, &dummy2,
2010          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2011          &prologue_cost_vec, &epilogue_cost_vec);
2012
2013       prologue_cost_vec.release ();
2014       epilogue_cost_vec.release ();
2015
2016       npeel = best_peel.peel_info.npeel;
2017       dr0 = best_peel.peel_info.dr;
2018
2019       /* If no peeling is not more expensive than the best peeling we
2020          have so far, don't perform any peeling.  */
2021       if (nopeel_inside_cost <= best_peel.inside_cost)
2022         do_peeling = false;
2023     }
2024
2025   if (do_peeling)
2026     {
2027       stmt = vect_dr_stmt (dr0);
2028       stmt_info = vinfo_for_stmt (stmt);
2029       vectype = STMT_VINFO_VECTYPE (stmt_info);
2030
2031       if (known_alignment_for_access_p (dr0))
2032         {
2033           bool negative = tree_int_cst_compare (DR_STEP (dr0),
2034                                                 size_zero_node) < 0;
2035           if (!npeel)
2036             {
2037               /* Since it's known at compile time, compute the number of
2038                  iterations in the peeled loop (the peeling factor) for use in
2039                  updating DR_MISALIGNMENT values.  The peeling factor is the
2040                  vectorization factor minus the misalignment as an element
2041                  count.  */
2042               mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2043               unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2044               npeel = ((mis & (target_align - 1))
2045                        / vect_get_scalar_dr_size (dr0));
2046             }
2047
2048           /* For interleaved data access every iteration accesses all the
2049              members of the group, therefore we divide the number of iterations
2050              by the group size.  */
2051           stmt_info = vinfo_for_stmt (vect_dr_stmt (dr0));
2052           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2053             npeel /= DR_GROUP_SIZE (stmt_info);
2054
2055           if (dump_enabled_p ())
2056             dump_printf_loc (MSG_NOTE, vect_location,
2057                              "Try peeling by %d\n", npeel);
2058         }
2059
2060       /* Ensure that all datarefs can be vectorized after the peel.  */
2061       if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2062         do_peeling = false;
2063
2064       /* Check if all datarefs are supportable and log.  */
2065       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2066         {
2067           stat = vect_verify_datarefs_alignment (loop_vinfo);
2068           if (!stat)
2069             do_peeling = false;
2070           else
2071             return stat;
2072         }
2073
2074       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2075       if (do_peeling)
2076         {
2077           unsigned max_allowed_peel
2078             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2079           if (max_allowed_peel != (unsigned)-1)
2080             {
2081               unsigned max_peel = npeel;
2082               if (max_peel == 0)
2083                 {
2084                   unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2085                   max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2086                 }
2087               if (max_peel > max_allowed_peel)
2088                 {
2089                   do_peeling = false;
2090                   if (dump_enabled_p ())
2091                     dump_printf_loc (MSG_NOTE, vect_location,
2092                         "Disable peeling, max peels reached: %d\n", max_peel);
2093                 }
2094             }
2095         }
2096
2097       /* Cost model #2 - if peeling may result in a remaining loop not
2098          iterating enough to be vectorized then do not peel.  Since this
2099          is a cost heuristic rather than a correctness decision, use the
2100          most likely runtime value for variable vectorization factors.  */
2101       if (do_peeling
2102           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2103         {
2104           unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2105           unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2106           if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2107               < assumed_vf + max_peel)
2108             do_peeling = false;
2109         }
2110
2111       if (do_peeling)
2112         {
2113           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2114              If the misalignment of DR_i is identical to that of dr0 then set
2115              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2116              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2117              by the peeling factor times the element size of DR_i (MOD the
2118              vectorization factor times the size).  Otherwise, the
2119              misalignment of DR_i must be set to unknown.  */
2120           FOR_EACH_VEC_ELT (datarefs, i, dr)
2121             if (dr != dr0)
2122               {
2123                 /* Strided accesses perform only component accesses, alignment
2124                    is irrelevant for them.  */
2125                 stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
2126                 if (STMT_VINFO_STRIDED_P (stmt_info)
2127                     && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2128                   continue;
2129
2130                 vect_update_misalignment_for_peel (dr, dr0, npeel);
2131               }
2132
2133           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2134           if (npeel)
2135             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2136           else
2137             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2138               = DR_MISALIGNMENT (dr0);
2139           SET_DR_MISALIGNMENT (dr0, 0);
2140           if (dump_enabled_p ())
2141             {
2142               dump_printf_loc (MSG_NOTE, vect_location,
2143                                "Alignment of access forced using peeling.\n");
2144               dump_printf_loc (MSG_NOTE, vect_location,
2145                                "Peeling for alignment will be applied.\n");
2146             }
2147
2148           /* The inside-loop cost will be accounted for in vectorizable_load
2149              and vectorizable_store correctly with adjusted alignments.
2150              Drop the body_cst_vec on the floor here.  */
2151           stat = vect_verify_datarefs_alignment (loop_vinfo);
2152           gcc_assert (stat);
2153           return stat;
2154         }
2155     }
2156
2157   /* (2) Versioning to force alignment.  */
2158
2159   /* Try versioning if:
2160      1) optimize loop for speed
2161      2) there is at least one unsupported misaligned data ref with an unknown
2162         misalignment, and
2163      3) all misaligned data refs with a known misalignment are supported, and
2164      4) the number of runtime alignment checks is within reason.  */
2165
2166   do_versioning =
2167         optimize_loop_nest_for_speed_p (loop)
2168         && (!loop->inner); /* FORNOW */
2169
2170   if (do_versioning)
2171     {
2172       FOR_EACH_VEC_ELT (datarefs, i, dr)
2173         {
2174           stmt = vect_dr_stmt (dr);
2175           stmt_info = vinfo_for_stmt (stmt);
2176
2177           /* For interleaving, only the alignment of the first access
2178              matters.  */
2179           if (aligned_access_p (dr)
2180               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2181                   && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2182             continue;
2183
2184           if (STMT_VINFO_STRIDED_P (stmt_info))
2185             {
2186               /* Strided loads perform only component accesses, alignment is
2187                  irrelevant for them.  */
2188               if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2189                 continue;
2190               do_versioning = false;
2191               break;
2192             }
2193
2194           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2195
2196           if (!supportable_dr_alignment)
2197             {
2198               gimple *stmt;
2199               int mask;
2200               tree vectype;
2201
2202               if (known_alignment_for_access_p (dr)
2203                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2204                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2205                 {
2206                   do_versioning = false;
2207                   break;
2208                 }
2209
2210               stmt = vect_dr_stmt (dr);
2211               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2212               gcc_assert (vectype);
2213
2214               /* At present we don't support versioning for alignment
2215                  with variable VF, since there's no guarantee that the
2216                  VF is a power of two.  We could relax this if we added
2217                  a way of enforcing a power-of-two size.  */
2218               unsigned HOST_WIDE_INT size;
2219               if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2220                 {
2221                   do_versioning = false;
2222                   break;
2223                 }
2224
2225               /* The rightmost bits of an aligned address must be zeros.
2226                  Construct the mask needed for this test.  For example,
2227                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2228                  mask must be 15 = 0xf. */
2229               mask = size - 1;
2230
2231               /* FORNOW: use the same mask to test all potentially unaligned
2232                  references in the loop.  The vectorizer currently supports
2233                  a single vector size, see the reference to
2234                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2235                  vectorization factor is computed.  */
2236               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2237                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2238               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2239               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2240                       vect_dr_stmt (dr));
2241             }
2242         }
2243
2244       /* Versioning requires at least one misaligned data reference.  */
2245       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2246         do_versioning = false;
2247       else if (!do_versioning)
2248         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2249     }
2250
2251   if (do_versioning)
2252     {
2253       vec<gimple *> may_misalign_stmts
2254         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2255       gimple *stmt;
2256
2257       /* It can now be assumed that the data references in the statements
2258          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2259          of the loop being vectorized.  */
2260       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2261         {
2262           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2263           dr = STMT_VINFO_DATA_REF (stmt_info);
2264           SET_DR_MISALIGNMENT (dr, 0);
2265           if (dump_enabled_p ())
2266             dump_printf_loc (MSG_NOTE, vect_location,
2267                              "Alignment of access forced using versioning.\n");
2268         }
2269
2270       if (dump_enabled_p ())
2271         dump_printf_loc (MSG_NOTE, vect_location,
2272                          "Versioning for alignment will be applied.\n");
2273
2274       /* Peeling and versioning can't be done together at this time.  */
2275       gcc_assert (! (do_peeling && do_versioning));
2276
2277       stat = vect_verify_datarefs_alignment (loop_vinfo);
2278       gcc_assert (stat);
2279       return stat;
2280     }
2281
2282   /* This point is reached if neither peeling nor versioning is being done.  */
2283   gcc_assert (! (do_peeling || do_versioning));
2284
2285   stat = vect_verify_datarefs_alignment (loop_vinfo);
2286   return stat;
2287 }
2288
2289
2290 /* Function vect_find_same_alignment_drs.
2291
2292    Update group and alignment relations according to the chosen
2293    vectorization factor.  */
2294
2295 static void
2296 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2297 {
2298   struct data_reference *dra = DDR_A (ddr);
2299   struct data_reference *drb = DDR_B (ddr);
2300   stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
2301   stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
2302
2303   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2304     return;
2305
2306   if (dra == drb)
2307     return;
2308
2309   if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2310       || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2311     return;
2312
2313   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2314       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2315       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2316     return;
2317
2318   /* Two references with distance zero have the same alignment.  */
2319   poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2320                           - wi::to_poly_offset (DR_INIT (drb)));
2321   if (maybe_ne (diff, 0))
2322     {
2323       /* Get the wider of the two alignments.  */
2324       unsigned int align_a = (vect_calculate_target_alignment (dra)
2325                               / BITS_PER_UNIT);
2326       unsigned int align_b = (vect_calculate_target_alignment (drb)
2327                               / BITS_PER_UNIT);
2328       unsigned int max_align = MAX (align_a, align_b);
2329
2330       /* Require the gap to be a multiple of the larger vector alignment.  */
2331       if (!multiple_p (diff, max_align))
2332         return;
2333     }
2334
2335   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2336   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2337   if (dump_enabled_p ())
2338     {
2339       dump_printf_loc (MSG_NOTE, vect_location,
2340                        "accesses have the same alignment: ");
2341       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2342       dump_printf (MSG_NOTE,  " and ");
2343       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2344       dump_printf (MSG_NOTE, "\n");
2345     }
2346 }
2347
2348
2349 /* Function vect_analyze_data_refs_alignment
2350
2351    Analyze the alignment of the data-references in the loop.
2352    Return FALSE if a data reference is found that cannot be vectorized.  */
2353
2354 bool
2355 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2356 {
2357   DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2358
2359   /* Mark groups of data references with same alignment using
2360      data dependence information.  */
2361   vec<ddr_p> ddrs = vinfo->shared->ddrs;
2362   struct data_dependence_relation *ddr;
2363   unsigned int i;
2364
2365   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2366     vect_find_same_alignment_drs (ddr);
2367
2368   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2369   struct data_reference *dr;
2370
2371   vect_record_base_alignments (vinfo);
2372   FOR_EACH_VEC_ELT (datarefs, i, dr)
2373     {
2374       stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
2375       if (STMT_VINFO_VECTORIZABLE (stmt_info))
2376         vect_compute_data_ref_alignment (dr);
2377     }
2378
2379   return true;
2380 }
2381
2382
2383 /* Analyze alignment of DRs of stmts in NODE.  */
2384
2385 static bool
2386 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2387 {
2388   /* We vectorize from the first scalar stmt in the node unless
2389      the node is permuted in which case we start from the first
2390      element in the group.  */
2391   gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2392   data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2393   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2394     first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2395
2396   data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2397   vect_compute_data_ref_alignment (dr);
2398   /* For creating the data-ref pointer we need alignment of the
2399      first element anyway.  */
2400   if (dr != first_dr)
2401     vect_compute_data_ref_alignment (first_dr);
2402   if (! verify_data_ref_alignment (dr))
2403     {
2404       if (dump_enabled_p ())
2405         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2406                          "not vectorized: bad data alignment in basic "
2407                          "block.\n");
2408       return false;
2409     }
2410
2411   return true;
2412 }
2413
2414 /* Function vect_slp_analyze_instance_alignment
2415
2416    Analyze the alignment of the data-references in the SLP instance.
2417    Return FALSE if a data reference is found that cannot be vectorized.  */
2418
2419 bool
2420 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2421 {
2422   DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2423
2424   slp_tree node;
2425   unsigned i;
2426   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2427     if (! vect_slp_analyze_and_verify_node_alignment (node))
2428       return false;
2429
2430   node = SLP_INSTANCE_TREE (instance);
2431   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2432       && ! vect_slp_analyze_and_verify_node_alignment
2433              (SLP_INSTANCE_TREE (instance)))
2434     return false;
2435
2436   return true;
2437 }
2438
2439
2440 /* Analyze groups of accesses: check that DR belongs to a group of
2441    accesses of legal size, step, etc.  Detect gaps, single element
2442    interleaving, and other special cases. Set grouped access info.
2443    Collect groups of strided stores for further use in SLP analysis.
2444    Worker for vect_analyze_group_access.  */
2445
2446 static bool
2447 vect_analyze_group_access_1 (struct data_reference *dr)
2448 {
2449   tree step = DR_STEP (dr);
2450   tree scalar_type = TREE_TYPE (DR_REF (dr));
2451   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2452   gimple *stmt = vect_dr_stmt (dr);
2453   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2454   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2455   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2456   HOST_WIDE_INT dr_step = -1;
2457   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2458   bool slp_impossible = false;
2459
2460   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2461      size of the interleaving group (including gaps).  */
2462   if (tree_fits_shwi_p (step))
2463     {
2464       dr_step = tree_to_shwi (step);
2465       /* Check that STEP is a multiple of type size.  Otherwise there is
2466          a non-element-sized gap at the end of the group which we
2467          cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2468          ???  As we can handle non-constant step fine here we should
2469          simply remove uses of DR_GROUP_GAP between the last and first
2470          element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
2471          simply not include that gap.  */
2472       if ((dr_step % type_size) != 0)
2473         {
2474           if (dump_enabled_p ())
2475             {
2476               dump_printf_loc (MSG_NOTE, vect_location,
2477                                "Step ");
2478               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2479               dump_printf (MSG_NOTE,
2480                            " is not a multiple of the element size for ");
2481               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2482               dump_printf (MSG_NOTE, "\n");
2483             }
2484           return false;
2485         }
2486       groupsize = absu_hwi (dr_step) / type_size;
2487     }
2488   else
2489     groupsize = 0;
2490
2491   /* Not consecutive access is possible only if it is a part of interleaving.  */
2492   if (!DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2493     {
2494       /* Check if it this DR is a part of interleaving, and is a single
2495          element of the group that is accessed in the loop.  */
2496
2497       /* Gaps are supported only for loads. STEP must be a multiple of the type
2498          size.  */
2499       if (DR_IS_READ (dr)
2500           && (dr_step % type_size) == 0
2501           && groupsize > 0)
2502         {
2503           DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2504           DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2505           DR_GROUP_GAP (stmt_info) = groupsize - 1;
2506           if (dump_enabled_p ())
2507             {
2508               dump_printf_loc (MSG_NOTE, vect_location,
2509                                "Detected single element interleaving ");
2510               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2511               dump_printf (MSG_NOTE, " step ");
2512               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2513               dump_printf (MSG_NOTE, "\n");
2514             }
2515
2516           return true;
2517         }
2518
2519       if (dump_enabled_p ())
2520         {
2521           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2522                            "not consecutive access ");
2523           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2524         }
2525
2526       if (bb_vinfo)
2527         {
2528           /* Mark the statement as unvectorizable.  */
2529           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr))) = false;
2530           return true;
2531         }
2532
2533       dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2534       STMT_VINFO_STRIDED_P (stmt_info) = true;
2535       return true;
2536     }
2537
2538   if (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2539     {
2540       /* First stmt in the interleaving chain. Check the chain.  */
2541       gimple *next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2542       struct data_reference *data_ref = dr;
2543       unsigned int count = 1;
2544       tree prev_init = DR_INIT (data_ref);
2545       gimple *prev = stmt;
2546       HOST_WIDE_INT diff, gaps = 0;
2547
2548       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2549       while (next)
2550         {
2551           /* Skip same data-refs.  In case that two or more stmts share
2552              data-ref (supported only for loads), we vectorize only the first
2553              stmt, and the rest get their vectorized loads from the first
2554              one.  */
2555           if (!tree_int_cst_compare (DR_INIT (data_ref),
2556                                      DR_INIT (STMT_VINFO_DATA_REF (
2557                                                    vinfo_for_stmt (next)))))
2558             {
2559               if (DR_IS_WRITE (data_ref))
2560                 {
2561                   if (dump_enabled_p ())
2562                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2563                                      "Two store stmts share the same dr.\n");
2564                   return false;
2565                 }
2566
2567               if (dump_enabled_p ())
2568                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2569                                  "Two or more load stmts share the same dr.\n");
2570
2571               /* For load use the same data-ref load.  */
2572               DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2573
2574               prev = next;
2575               next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2576               continue;
2577             }
2578
2579           prev = next;
2580           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2581
2582           /* All group members have the same STEP by construction.  */
2583           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2584
2585           /* Check that the distance between two accesses is equal to the type
2586              size. Otherwise, we have gaps.  */
2587           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2588                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2589           if (diff != 1)
2590             {
2591               /* FORNOW: SLP of accesses with gaps is not supported.  */
2592               slp_impossible = true;
2593               if (DR_IS_WRITE (data_ref))
2594                 {
2595                   if (dump_enabled_p ())
2596                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2597                                      "interleaved store with gaps\n");
2598                   return false;
2599                 }
2600
2601               gaps += diff - 1;
2602             }
2603
2604           last_accessed_element += diff;
2605
2606           /* Store the gap from the previous member of the group. If there is no
2607              gap in the access, DR_GROUP_GAP is always 1.  */
2608           DR_GROUP_GAP (vinfo_for_stmt (next)) = diff;
2609
2610           prev_init = DR_INIT (data_ref);
2611           next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2612           /* Count the number of data-refs in the chain.  */
2613           count++;
2614         }
2615
2616       if (groupsize == 0)
2617         groupsize = count + gaps;
2618
2619       /* This could be UINT_MAX but as we are generating code in a very
2620          inefficient way we have to cap earlier.  See PR78699 for example.  */
2621       if (groupsize > 4096)
2622         {
2623           if (dump_enabled_p ())
2624             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2625                              "group is too large\n");
2626           return false;
2627         }
2628
2629       /* Check that the size of the interleaving is equal to count for stores,
2630          i.e., that there are no gaps.  */
2631       if (groupsize != count
2632           && !DR_IS_READ (dr))
2633         {
2634           if (dump_enabled_p ())
2635             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2636                              "interleaved store with gaps\n");
2637           return false;
2638         }
2639
2640       /* If there is a gap after the last load in the group it is the
2641          difference between the groupsize and the last accessed
2642          element.
2643          When there is no gap, this difference should be 0.  */
2644       DR_GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2645
2646       DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2647       if (dump_enabled_p ())
2648         {
2649           dump_printf_loc (MSG_NOTE, vect_location,
2650                            "Detected interleaving ");
2651           if (DR_IS_READ (dr))
2652             dump_printf (MSG_NOTE, "load ");
2653           else
2654             dump_printf (MSG_NOTE, "store ");
2655           dump_printf (MSG_NOTE, "of size %u starting with ",
2656                        (unsigned)groupsize);
2657           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2658           if (DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2659             dump_printf_loc (MSG_NOTE, vect_location,
2660                              "There is a gap of %u elements after the group\n",
2661                              DR_GROUP_GAP (vinfo_for_stmt (stmt)));
2662         }
2663
2664       /* SLP: create an SLP data structure for every interleaving group of
2665          stores for further analysis in vect_analyse_slp.  */
2666       if (DR_IS_WRITE (dr) && !slp_impossible)
2667         {
2668           if (loop_vinfo)
2669             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2670           if (bb_vinfo)
2671             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2672         }
2673     }
2674
2675   return true;
2676 }
2677
2678 /* Analyze groups of accesses: check that DR belongs to a group of
2679    accesses of legal size, step, etc.  Detect gaps, single element
2680    interleaving, and other special cases. Set grouped access info.
2681    Collect groups of strided stores for further use in SLP analysis.  */
2682
2683 static bool
2684 vect_analyze_group_access (struct data_reference *dr)
2685 {
2686   if (!vect_analyze_group_access_1 (dr))
2687     {
2688       /* Dissolve the group if present.  */
2689       gimple *next;
2690       gimple *stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (dr)));
2691       while (stmt)
2692         {
2693           stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2694           next = DR_GROUP_NEXT_ELEMENT (vinfo);
2695           DR_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2696           DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2697           stmt = next;
2698         }
2699       return false;
2700     }
2701   return true;
2702 }
2703
2704 /* Analyze the access pattern of the data-reference DR.
2705    In case of non-consecutive accesses call vect_analyze_group_access() to
2706    analyze groups of accesses.  */
2707
2708 static bool
2709 vect_analyze_data_ref_access (struct data_reference *dr)
2710 {
2711   tree step = DR_STEP (dr);
2712   tree scalar_type = TREE_TYPE (DR_REF (dr));
2713   gimple *stmt = vect_dr_stmt (dr);
2714   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2715   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2716   struct loop *loop = NULL;
2717
2718   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2719     return true;
2720
2721   if (loop_vinfo)
2722     loop = LOOP_VINFO_LOOP (loop_vinfo);
2723
2724   if (loop_vinfo && !step)
2725     {
2726       if (dump_enabled_p ())
2727         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2728                          "bad data-ref access in loop\n");
2729       return false;
2730     }
2731
2732   /* Allow loads with zero step in inner-loop vectorization.  */
2733   if (loop_vinfo && integer_zerop (step))
2734     {
2735       DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2736       if (!nested_in_vect_loop_p (loop, stmt))
2737         return DR_IS_READ (dr);
2738       /* Allow references with zero step for outer loops marked
2739          with pragma omp simd only - it guarantees absence of
2740          loop-carried dependencies between inner loop iterations.  */
2741       if (loop->safelen < 2)
2742         {
2743           if (dump_enabled_p ())
2744             dump_printf_loc (MSG_NOTE, vect_location,
2745                              "zero step in inner loop of nest\n");
2746           return false;
2747         }
2748     }
2749
2750   if (loop && nested_in_vect_loop_p (loop, stmt))
2751     {
2752       /* Interleaved accesses are not yet supported within outer-loop
2753         vectorization for references in the inner-loop.  */
2754       DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2755
2756       /* For the rest of the analysis we use the outer-loop step.  */
2757       step = STMT_VINFO_DR_STEP (stmt_info);
2758       if (integer_zerop (step))
2759         {
2760           if (dump_enabled_p ())
2761             dump_printf_loc (MSG_NOTE, vect_location,
2762                              "zero step in outer loop.\n");
2763           return DR_IS_READ (dr);
2764         }
2765     }
2766
2767   /* Consecutive?  */
2768   if (TREE_CODE (step) == INTEGER_CST)
2769     {
2770       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2771       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2772           || (dr_step < 0
2773               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2774         {
2775           /* Mark that it is not interleaving.  */
2776           DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2777           return true;
2778         }
2779     }
2780
2781   if (loop && nested_in_vect_loop_p (loop, stmt))
2782     {
2783       if (dump_enabled_p ())
2784         dump_printf_loc (MSG_NOTE, vect_location,
2785                          "grouped access in outer loop.\n");
2786       return false;
2787     }
2788
2789
2790   /* Assume this is a DR handled by non-constant strided load case.  */
2791   if (TREE_CODE (step) != INTEGER_CST)
2792     return (STMT_VINFO_STRIDED_P (stmt_info)
2793             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2794                 || vect_analyze_group_access (dr)));
2795
2796   /* Not consecutive access - check if it's a part of interleaving group.  */
2797   return vect_analyze_group_access (dr);
2798 }
2799
2800 /* Compare two data-references DRA and DRB to group them into chunks
2801    suitable for grouping.  */
2802
2803 static int
2804 dr_group_sort_cmp (const void *dra_, const void *drb_)
2805 {
2806   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2807   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2808   int cmp;
2809
2810   /* Stabilize sort.  */
2811   if (dra == drb)
2812     return 0;
2813
2814   /* DRs in different loops never belong to the same group.  */
2815   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2816   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2817   if (loopa != loopb)
2818     return loopa->num < loopb->num ? -1 : 1;
2819
2820   /* Ordering of DRs according to base.  */
2821   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2822                                DR_BASE_ADDRESS (drb));
2823   if (cmp != 0)
2824     return cmp;
2825
2826   /* And according to DR_OFFSET.  */
2827   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2828   if (cmp != 0)
2829     return cmp;
2830
2831   /* Put reads before writes.  */
2832   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2833     return DR_IS_READ (dra) ? -1 : 1;
2834
2835   /* Then sort after access size.  */
2836   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2837                                TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2838   if (cmp != 0)
2839     return cmp;
2840
2841   /* And after step.  */
2842   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2843   if (cmp != 0)
2844     return cmp;
2845
2846   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2847   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2848   if (cmp == 0)
2849     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2850   return cmp;
2851 }
2852
2853 /* If OP is the result of a conversion, return the unconverted value,
2854    otherwise return null.  */
2855
2856 static tree
2857 strip_conversion (tree op)
2858 {
2859   if (TREE_CODE (op) != SSA_NAME)
2860     return NULL_TREE;
2861   gimple *stmt = SSA_NAME_DEF_STMT (op);
2862   if (!is_gimple_assign (stmt)
2863       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2864     return NULL_TREE;
2865   return gimple_assign_rhs1 (stmt);
2866 }
2867
2868 /* Return true if vectorizable_* routines can handle statements STMT1
2869    and STMT2 being in a single group.  */
2870
2871 static bool
2872 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2873 {
2874   if (gimple_assign_single_p (stmt1))
2875     return gimple_assign_single_p (stmt2);
2876
2877   if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2878     {
2879       /* Check for two masked loads or two masked stores.  */
2880       if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2881         return false;
2882       internal_fn ifn = gimple_call_internal_fn (stmt1);
2883       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2884         return false;
2885       if (ifn != gimple_call_internal_fn (stmt2))
2886         return false;
2887
2888       /* Check that the masks are the same.  Cope with casts of masks,
2889          like those created by build_mask_conversion.  */
2890       tree mask1 = gimple_call_arg (stmt1, 2);
2891       tree mask2 = gimple_call_arg (stmt2, 2);
2892       if (!operand_equal_p (mask1, mask2, 0))
2893         {
2894           mask1 = strip_conversion (mask1);
2895           if (!mask1)
2896             return false;
2897           mask2 = strip_conversion (mask2);
2898           if (!mask2)
2899             return false;
2900           if (!operand_equal_p (mask1, mask2, 0))
2901             return false;
2902         }
2903       return true;
2904     }
2905
2906   return false;
2907 }
2908
2909 /* Function vect_analyze_data_ref_accesses.
2910
2911    Analyze the access pattern of all the data references in the loop.
2912
2913    FORNOW: the only access pattern that is considered vectorizable is a
2914            simple step 1 (consecutive) access.
2915
2916    FORNOW: handle only arrays and pointer accesses.  */
2917
2918 bool
2919 vect_analyze_data_ref_accesses (vec_info *vinfo)
2920 {
2921   unsigned int i;
2922   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2923   struct data_reference *dr;
2924
2925   DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2926
2927   if (datarefs.is_empty ())
2928     return true;
2929
2930   /* Sort the array of datarefs to make building the interleaving chains
2931      linear.  Don't modify the original vector's order, it is needed for
2932      determining what dependencies are reversed.  */
2933   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2934   datarefs_copy.qsort (dr_group_sort_cmp);
2935
2936   /* Build the interleaving chains.  */
2937   for (i = 0; i < datarefs_copy.length () - 1;)
2938     {
2939       data_reference_p dra = datarefs_copy[i];
2940       stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
2941       stmt_vec_info lastinfo = NULL;
2942       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2943           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2944         {
2945           ++i;
2946           continue;
2947         }
2948       for (i = i + 1; i < datarefs_copy.length (); ++i)
2949         {
2950           data_reference_p drb = datarefs_copy[i];
2951           stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
2952           if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2953               || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2954             break;
2955
2956           /* ???  Imperfect sorting (non-compatible types, non-modulo
2957              accesses, same accesses) can lead to a group to be artificially
2958              split here as we don't just skip over those.  If it really
2959              matters we can push those to a worklist and re-iterate
2960              over them.  The we can just skip ahead to the next DR here.  */
2961
2962           /* DRs in a different loop should not be put into the same
2963              interleaving group.  */
2964           if (gimple_bb (DR_STMT (dra))->loop_father
2965               != gimple_bb (DR_STMT (drb))->loop_father)
2966             break;
2967
2968           /* Check that the data-refs have same first location (except init)
2969              and they are both either store or load (not load and store,
2970              not masked loads or stores).  */
2971           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2972               || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2973                                         DR_BASE_ADDRESS (drb)) != 0
2974               || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2975               || !can_group_stmts_p (vect_dr_stmt (dra), vect_dr_stmt (drb)))
2976             break;
2977
2978           /* Check that the data-refs have the same constant size.  */
2979           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2980           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2981           if (!tree_fits_uhwi_p (sza)
2982               || !tree_fits_uhwi_p (szb)
2983               || !tree_int_cst_equal (sza, szb))
2984             break;
2985
2986           /* Check that the data-refs have the same step.  */
2987           if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2988             break;
2989
2990           /* Check the types are compatible.
2991              ???  We don't distinguish this during sorting.  */
2992           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2993                                    TREE_TYPE (DR_REF (drb))))
2994             break;
2995
2996           /* Check that the DR_INITs are compile-time constants.  */
2997           if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2998               || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2999             break;
3000
3001           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
3002           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3003           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3004           HOST_WIDE_INT init_prev
3005             = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3006           gcc_assert (init_a <= init_b
3007                       && init_a <= init_prev
3008                       && init_prev <= init_b);
3009
3010           /* Do not place the same access in the interleaving chain twice.  */
3011           if (init_b == init_prev)
3012             {
3013               gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3014                           < gimple_uid (DR_STMT (drb)));
3015               /* ???  For now we simply "drop" the later reference which is
3016                  otherwise the same rather than finishing off this group.
3017                  In the end we'd want to re-process duplicates forming
3018                  multiple groups from the refs, likely by just collecting
3019                  all candidates (including duplicates and split points
3020                  below) in a vector and then process them together.  */
3021               continue;
3022             }
3023
3024           /* If init_b == init_a + the size of the type * k, we have an
3025              interleaving, and DRA is accessed before DRB.  */
3026           HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3027           if (type_size_a == 0
3028               || (init_b - init_a) % type_size_a != 0)
3029             break;
3030
3031           /* If we have a store, the accesses are adjacent.  This splits
3032              groups into chunks we support (we don't support vectorization
3033              of stores with gaps).  */
3034           if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3035             break;
3036
3037           /* If the step (if not zero or non-constant) is greater than the
3038              difference between data-refs' inits this splits groups into
3039              suitable sizes.  */
3040           if (tree_fits_shwi_p (DR_STEP (dra)))
3041             {
3042               HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3043               if (step != 0 && step <= (init_b - init_a))
3044                 break;
3045             }
3046
3047           if (dump_enabled_p ())
3048             {
3049               dump_printf_loc (MSG_NOTE, vect_location,
3050                                "Detected interleaving ");
3051               if (DR_IS_READ (dra))
3052                 dump_printf (MSG_NOTE, "load ");
3053               else
3054                 dump_printf (MSG_NOTE, "store ");
3055               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3056               dump_printf (MSG_NOTE,  " and ");
3057               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3058               dump_printf (MSG_NOTE, "\n");
3059             }
3060
3061           /* Link the found element into the group list.  */
3062           if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3063             {
3064               DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = vect_dr_stmt (dra);
3065               lastinfo = stmtinfo_a;
3066             }
3067           DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = vect_dr_stmt (dra);
3068           DR_GROUP_NEXT_ELEMENT (lastinfo) = vect_dr_stmt (drb);
3069           lastinfo = stmtinfo_b;
3070         }
3071     }
3072
3073   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3074     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr)))
3075         && !vect_analyze_data_ref_access (dr))
3076       {
3077         if (dump_enabled_p ())
3078           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3079                            "not vectorized: complicated access pattern.\n");
3080
3081         if (is_a <bb_vec_info> (vinfo))
3082           {
3083             /* Mark the statement as not vectorizable.  */
3084             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr))) = false;
3085             continue;
3086           }
3087         else
3088           {
3089             datarefs_copy.release ();
3090             return false;
3091           }
3092       }
3093
3094   datarefs_copy.release ();
3095   return true;
3096 }
3097
3098 /* Function vect_vfa_segment_size.
3099
3100    Input:
3101      DR: The data reference.
3102      LENGTH_FACTOR: segment length to consider.
3103
3104    Return a value suitable for the dr_with_seg_len::seg_len field.
3105    This is the "distance travelled" by the pointer from the first
3106    iteration in the segment to the last.  Note that it does not include
3107    the size of the access; in effect it only describes the first byte.  */
3108
3109 static tree
3110 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3111 {
3112   length_factor = size_binop (MINUS_EXPR,
3113                               fold_convert (sizetype, length_factor),
3114                               size_one_node);
3115   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3116                      length_factor);
3117 }
3118
3119 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3120    gives the worst-case number of bytes covered by the segment.  */
3121
3122 static unsigned HOST_WIDE_INT
3123 vect_vfa_access_size (data_reference *dr)
3124 {
3125   stmt_vec_info stmt_vinfo = vinfo_for_stmt (vect_dr_stmt (dr));
3126   tree ref_type = TREE_TYPE (DR_REF (dr));
3127   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3128   unsigned HOST_WIDE_INT access_size = ref_size;
3129   if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3130     {
3131       gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == vect_dr_stmt (dr));
3132       access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3133     }
3134   if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3135       && (vect_supportable_dr_alignment (dr, false)
3136           == dr_explicit_realign_optimized))
3137     {
3138       /* We might access a full vector's worth.  */
3139       tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3140       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3141     }
3142   return access_size;
3143 }
3144
3145 /* Get the minimum alignment for all the scalar accesses that DR describes.  */
3146
3147 static unsigned int
3148 vect_vfa_align (const data_reference *dr)
3149 {
3150   return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3151 }
3152
3153 /* Function vect_no_alias_p.
3154
3155    Given data references A and B with equal base and offset, see whether
3156    the alias relation can be decided at compilation time.  Return 1 if
3157    it can and the references alias, 0 if it can and the references do
3158    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3159    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3160    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
3161
3162 static int
3163 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3164                          tree segment_length_a, tree segment_length_b,
3165                          unsigned HOST_WIDE_INT access_size_a,
3166                          unsigned HOST_WIDE_INT access_size_b)
3167 {
3168   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3169   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3170   poly_uint64 const_length_a;
3171   poly_uint64 const_length_b;
3172
3173   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3174      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3175      [a, a+12) */
3176   if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3177     {
3178       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3179       offset_a = (offset_a + access_size_a) - const_length_a;
3180     }
3181   else
3182     const_length_a = tree_to_poly_uint64 (segment_length_a);
3183   if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3184     {
3185       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3186       offset_b = (offset_b + access_size_b) - const_length_b;
3187     }
3188   else
3189     const_length_b = tree_to_poly_uint64 (segment_length_b);
3190
3191   const_length_a += access_size_a;
3192   const_length_b += access_size_b;
3193
3194   if (ranges_known_overlap_p (offset_a, const_length_a,
3195                               offset_b, const_length_b))
3196     return 1;
3197
3198   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3199                                offset_b, const_length_b))
3200     return 0;
3201
3202   return -1;
3203 }
3204
3205 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3206    in DDR is >= VF.  */
3207
3208 static bool
3209 dependence_distance_ge_vf (data_dependence_relation *ddr,
3210                            unsigned int loop_depth, poly_uint64 vf)
3211 {
3212   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3213       || DDR_NUM_DIST_VECTS (ddr) == 0)
3214     return false;
3215
3216   /* If the dependence is exact, we should have limited the VF instead.  */
3217   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3218
3219   unsigned int i;
3220   lambda_vector dist_v;
3221   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3222     {
3223       HOST_WIDE_INT dist = dist_v[loop_depth];
3224       if (dist != 0
3225           && !(dist > 0 && DDR_REVERSED_P (ddr))
3226           && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3227         return false;
3228     }
3229
3230   if (dump_enabled_p ())
3231     {
3232       dump_printf_loc (MSG_NOTE, vect_location,
3233                        "dependence distance between ");
3234       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3235       dump_printf (MSG_NOTE,  " and ");
3236       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3237       dump_printf (MSG_NOTE,  " is >= VF\n");
3238     }
3239
3240   return true;
3241 }
3242
3243 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3244
3245 static void
3246 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3247 {
3248   dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3249   dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3250   dump_printf (dump_kind, ") >= ");
3251   dump_dec (dump_kind, lower_bound.min_value);
3252 }
3253
3254 /* Record that the vectorized loop requires the vec_lower_bound described
3255    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3256
3257 static void
3258 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3259                         poly_uint64 min_value)
3260 {
3261   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3262   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3263     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3264       {
3265         unsigned_p &= lower_bounds[i].unsigned_p;
3266         min_value = upper_bound (lower_bounds[i].min_value, min_value);
3267         if (lower_bounds[i].unsigned_p != unsigned_p
3268             || maybe_lt (lower_bounds[i].min_value, min_value))
3269           {
3270             lower_bounds[i].unsigned_p = unsigned_p;
3271             lower_bounds[i].min_value = min_value;
3272             if (dump_enabled_p ())
3273               {
3274                 dump_printf_loc (MSG_NOTE, vect_location,
3275                                  "updating run-time check to ");
3276                 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3277                 dump_printf (MSG_NOTE, "\n");
3278               }
3279           }
3280         return;
3281       }
3282
3283   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3284   if (dump_enabled_p ())
3285     {
3286       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3287       dump_lower_bound (MSG_NOTE, lower_bound);
3288       dump_printf (MSG_NOTE, "\n");
3289     }
3290   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3291 }
3292
3293 /* Return true if it's unlikely that the step of the vectorized form of DR
3294    will span fewer than GAP bytes.  */
3295
3296 static bool
3297 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3298 {
3299   stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
3300   HOST_WIDE_INT count
3301     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3302   if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3303     count *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
3304   return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3305 }
3306
3307 /* Return true if we know that there is no alias between DR_A and DR_B
3308    when abs (DR_STEP (DR_A)) >= N for some N.  When returning true, set
3309    *LOWER_BOUND_OUT to this N.  */
3310
3311 static bool
3312 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3313                                 poly_uint64 *lower_bound_out)
3314 {
3315   /* Check that there is a constant gap of known sign between DR_A
3316      and DR_B.  */
3317   poly_int64 init_a, init_b;
3318   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3319       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3320       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3321       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3322       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3323       || !ordered_p (init_a, init_b))
3324     return false;
3325
3326   /* Sort DR_A and DR_B by the address they access.  */
3327   if (maybe_lt (init_b, init_a))
3328     {
3329       std::swap (init_a, init_b);
3330       std::swap (dr_a, dr_b);
3331     }
3332
3333   /* If the two accesses could be dependent within a scalar iteration,
3334      make sure that we'd retain their order.  */
3335   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3336       && !vect_preserves_scalar_order_p (vect_dr_stmt (dr_a),
3337                                          vect_dr_stmt (dr_b)))
3338     return false;
3339
3340   /* There is no alias if abs (DR_STEP) is greater than or equal to
3341      the bytes spanned by the combination of the two accesses.  */
3342   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3343   return true;
3344 }
3345
3346 /* Function vect_prune_runtime_alias_test_list.
3347
3348    Prune a list of ddrs to be tested at run-time by versioning for alias.
3349    Merge several alias checks into one if possible.
3350    Return FALSE if resulting list of ddrs is longer then allowed by
3351    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3352
3353 bool
3354 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3355 {
3356   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3357   hash_set <tree_pair_hash> compared_objects;
3358
3359   vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3360   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3361     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3362   vec<vec_object_pair> &check_unequal_addrs
3363     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3364   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3365   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3366
3367   ddr_p ddr;
3368   unsigned int i;
3369   tree length_factor;
3370
3371   DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3372
3373   /* Step values are irrelevant for aliasing if the number of vector
3374      iterations is equal to the number of scalar iterations (which can
3375      happen for fully-SLP loops).  */
3376   bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3377
3378   if (!ignore_step_p)
3379     {
3380       /* Convert the checks for nonzero steps into bound tests.  */
3381       tree value;
3382       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3383         vect_check_lower_bound (loop_vinfo, value, true, 1);
3384     }
3385
3386   if (may_alias_ddrs.is_empty ())
3387     return true;
3388
3389   comp_alias_ddrs.create (may_alias_ddrs.length ());
3390
3391   unsigned int loop_depth
3392     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3393                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
3394
3395   /* First, we collect all data ref pairs for aliasing checks.  */
3396   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3397     {
3398       int comp_res;
3399       poly_uint64 lower_bound;
3400       struct data_reference *dr_a, *dr_b;
3401       gimple *dr_group_first_a, *dr_group_first_b;
3402       tree segment_length_a, segment_length_b;
3403       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3404       unsigned int align_a, align_b;
3405       gimple *stmt_a, *stmt_b;
3406
3407       /* Ignore the alias if the VF we chose ended up being no greater
3408          than the dependence distance.  */
3409       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3410         continue;
3411
3412       if (DDR_OBJECT_A (ddr))
3413         {
3414           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3415           if (!compared_objects.add (new_pair))
3416             {
3417               if (dump_enabled_p ())
3418                 {
3419                   dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3420                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3421                   dump_printf (MSG_NOTE, " and ");
3422                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3423                   dump_printf (MSG_NOTE, " have different addresses\n");
3424                 }
3425               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3426             }
3427           continue;
3428         }
3429
3430       dr_a = DDR_A (ddr);
3431       stmt_a = vect_dr_stmt (DDR_A (ddr));
3432
3433       dr_b = DDR_B (ddr);
3434       stmt_b = vect_dr_stmt (DDR_B (ddr));
3435
3436       /* Skip the pair if inter-iteration dependencies are irrelevant
3437          and intra-iteration dependencies are guaranteed to be honored.  */
3438       if (ignore_step_p
3439           && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3440               || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3441         {
3442           if (dump_enabled_p ())
3443             {
3444               dump_printf_loc (MSG_NOTE, vect_location,
3445                                "no need for alias check between ");
3446               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3447               dump_printf (MSG_NOTE, " and ");
3448               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3449               dump_printf (MSG_NOTE, " when VF is 1\n");
3450             }
3451           continue;
3452         }
3453
3454       /* See whether we can handle the alias using a bounds check on
3455          the step, and whether that's likely to be the best approach.
3456          (It might not be, for example, if the minimum step is much larger
3457          than the number of bytes handled by one vector iteration.)  */
3458       if (!ignore_step_p
3459           && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3460           && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3461           && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3462               || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3463         {
3464           bool unsigned_p = dr_known_forward_stride_p (dr_a);
3465           if (dump_enabled_p ())
3466             {
3467               dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3468               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3469               dump_printf (MSG_NOTE, " and ");
3470               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3471               dump_printf (MSG_NOTE, " when the step ");
3472               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3473               dump_printf (MSG_NOTE, " is outside ");
3474               if (unsigned_p)
3475                 dump_printf (MSG_NOTE, "[0");
3476               else
3477                 {
3478                   dump_printf (MSG_NOTE, "(");
3479                   dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3480                 }
3481               dump_printf (MSG_NOTE, ", ");
3482               dump_dec (MSG_NOTE, lower_bound);
3483               dump_printf (MSG_NOTE, ")\n");
3484             }
3485           vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3486                                   lower_bound);
3487           continue;
3488         }
3489
3490       dr_group_first_a = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3491       if (dr_group_first_a)
3492         {
3493           stmt_a = dr_group_first_a;
3494           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3495         }
3496
3497       dr_group_first_b = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3498       if (dr_group_first_b)
3499         {
3500           stmt_b = dr_group_first_b;
3501           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3502         }
3503
3504       if (ignore_step_p)
3505         {
3506           segment_length_a = size_zero_node;
3507           segment_length_b = size_zero_node;
3508         }
3509       else
3510         {
3511           if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3512             length_factor = scalar_loop_iters;
3513           else
3514             length_factor = size_int (vect_factor);
3515           segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3516           segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3517         }
3518       access_size_a = vect_vfa_access_size (dr_a);
3519       access_size_b = vect_vfa_access_size (dr_b);
3520       align_a = vect_vfa_align (dr_a);
3521       align_b = vect_vfa_align (dr_b);
3522
3523       comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3524                                         DR_BASE_ADDRESS (dr_b));
3525       if (comp_res == 0)
3526         comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3527                                           DR_OFFSET (dr_b));
3528
3529       /* See whether the alias is known at compilation time.  */
3530       if (comp_res == 0
3531           && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3532           && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3533           && poly_int_tree_p (segment_length_a)
3534           && poly_int_tree_p (segment_length_b))
3535         {
3536           int res = vect_compile_time_alias (dr_a, dr_b,
3537                                              segment_length_a,
3538                                              segment_length_b,
3539                                              access_size_a,
3540                                              access_size_b);
3541           if (res >= 0 && dump_enabled_p ())
3542             {
3543               dump_printf_loc (MSG_NOTE, vect_location,
3544                                "can tell at compile time that ");
3545               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3546               dump_printf (MSG_NOTE, " and ");
3547               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3548               if (res == 0)
3549                 dump_printf (MSG_NOTE, " do not alias\n");
3550               else
3551                 dump_printf (MSG_NOTE, " alias\n");
3552             }
3553
3554           if (res == 0)
3555             continue;
3556
3557           if (res == 1)
3558             {
3559               if (dump_enabled_p ())
3560                 dump_printf_loc (MSG_NOTE, vect_location,
3561                                  "not vectorized: compilation time alias.\n");
3562               return false;
3563             }
3564         }
3565
3566       dr_with_seg_len_pair_t dr_with_seg_len_pair
3567         (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3568          dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3569
3570       /* Canonicalize pairs by sorting the two DR members.  */
3571       if (comp_res > 0)
3572         std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3573
3574       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3575     }
3576
3577   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3578
3579   unsigned int count = (comp_alias_ddrs.length ()
3580                         + check_unequal_addrs.length ());
3581
3582   dump_printf_loc (MSG_NOTE, vect_location,
3583                    "improved number of alias checks from %d to %d\n",
3584                    may_alias_ddrs.length (), count);
3585   if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3586     {
3587       if (dump_enabled_p ())
3588         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3589                          "number of versioning for alias "
3590                          "run-time tests exceeds %d "
3591                          "(--param vect-max-version-for-alias-checks)\n",
3592                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3593       return false;
3594     }
3595
3596   return true;
3597 }
3598
3599 /* Check whether we can use an internal function for a gather load
3600    or scatter store.  READ_P is true for loads and false for stores.
3601    MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
3602    the type of the memory elements being loaded or stored.  OFFSET_BITS
3603    is the number of bits in each scalar offset and OFFSET_SIGN is the
3604    sign of the offset.  SCALE is the amount by which the offset should
3605    be multiplied *after* it has been converted to address width.
3606
3607    Return true if the function is supported, storing the function
3608    id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT.  */
3609
3610 bool
3611 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3612                           tree memory_type, unsigned int offset_bits,
3613                           signop offset_sign, int scale,
3614                           internal_fn *ifn_out, tree *element_type_out)
3615 {
3616   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3617   unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3618   if (offset_bits > element_bits)
3619     /* Internal functions require the offset to be the same width as
3620        the vector elements.  We can extend narrower offsets, but it isn't
3621        safe to truncate wider offsets.  */
3622     return false;
3623
3624   if (element_bits != memory_bits)
3625     /* For now the vector elements must be the same width as the
3626        memory elements.  */
3627     return false;
3628
3629   /* Work out which function we need.  */
3630   internal_fn ifn;
3631   if (read_p)
3632     ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3633   else
3634     ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3635
3636   /* Test whether the target supports this combination.  */
3637   if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3638                                                offset_sign, scale))
3639     return false;
3640
3641   *ifn_out = ifn;
3642   *element_type_out = TREE_TYPE (vectype);
3643   return true;
3644 }
3645
3646 /* CALL is a call to an internal gather load or scatter store function.
3647    Describe the operation in INFO.  */
3648
3649 static void
3650 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3651 {
3652   stmt_vec_info stmt_info = vinfo_for_stmt (call);
3653   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3654   data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3655
3656   info->ifn = gimple_call_internal_fn (call);
3657   info->decl = NULL_TREE;
3658   info->base = gimple_call_arg (call, 0);
3659   info->offset = gimple_call_arg (call, 1);
3660   info->offset_dt = vect_unknown_def_type;
3661   info->offset_vectype = NULL_TREE;
3662   info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3663   info->element_type = TREE_TYPE (vectype);
3664   info->memory_type = TREE_TYPE (DR_REF (dr));
3665 }
3666
3667 /* Return true if a non-affine read or write in STMT is suitable for a
3668    gather load or scatter store.  Describe the operation in *INFO if so.  */
3669
3670 bool
3671 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3672                            gather_scatter_info *info)
3673 {
3674   HOST_WIDE_INT scale = 1;
3675   poly_int64 pbitpos, pbitsize;
3676   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3677   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3678   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3679   tree offtype = NULL_TREE;
3680   tree decl = NULL_TREE, base, off;
3681   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3682   tree memory_type = TREE_TYPE (DR_REF (dr));
3683   machine_mode pmode;
3684   int punsignedp, reversep, pvolatilep = 0;
3685   internal_fn ifn;
3686   tree element_type;
3687   bool masked_p = false;
3688
3689   /* See whether this is already a call to a gather/scatter internal function.
3690      If not, see whether it's a masked load or store.  */
3691   gcall *call = dyn_cast <gcall *> (stmt);
3692   if (call && gimple_call_internal_p (call))
3693     {
3694       ifn = gimple_call_internal_fn (stmt);
3695       if (internal_gather_scatter_fn_p (ifn))
3696         {
3697           vect_describe_gather_scatter_call (call, info);
3698           return true;
3699         }
3700       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3701     }
3702
3703   /* True if we should aim to use internal functions rather than
3704      built-in functions.  */
3705   bool use_ifn_p = (DR_IS_READ (dr)
3706                     ? supports_vec_gather_load_p ()
3707                     : supports_vec_scatter_store_p ());
3708
3709   base = DR_REF (dr);
3710   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3711      see if we can use the def stmt of the address.  */
3712   if (masked_p
3713       && TREE_CODE (base) == MEM_REF
3714       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3715       && integer_zerop (TREE_OPERAND (base, 1))
3716       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3717     {
3718       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3719       if (is_gimple_assign (def_stmt)
3720           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3721         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3722     }
3723
3724   /* The gather and scatter builtins need address of the form
3725      loop_invariant + vector * {1, 2, 4, 8}
3726      or
3727      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3728      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3729      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3730      multiplications and additions in it.  To get a vector, we need
3731      a single SSA_NAME that will be defined in the loop and will
3732      contain everything that is not loop invariant and that can be
3733      vectorized.  The following code attempts to find such a preexistng
3734      SSA_NAME OFF and put the loop invariants into a tree BASE
3735      that can be gimplified before the loop.  */
3736   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3737                               &punsignedp, &reversep, &pvolatilep);
3738   if (reversep)
3739     return false;
3740
3741   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3742
3743   if (TREE_CODE (base) == MEM_REF)
3744     {
3745       if (!integer_zerop (TREE_OPERAND (base, 1)))
3746         {
3747           if (off == NULL_TREE)
3748             off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3749           else
3750             off = size_binop (PLUS_EXPR, off,
3751                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3752         }
3753       base = TREE_OPERAND (base, 0);
3754     }
3755   else
3756     base = build_fold_addr_expr (base);
3757
3758   if (off == NULL_TREE)
3759     off = size_zero_node;
3760
3761   /* If base is not loop invariant, either off is 0, then we start with just
3762      the constant offset in the loop invariant BASE and continue with base
3763      as OFF, otherwise give up.
3764      We could handle that case by gimplifying the addition of base + off
3765      into some SSA_NAME and use that as off, but for now punt.  */
3766   if (!expr_invariant_in_loop_p (loop, base))
3767     {
3768       if (!integer_zerop (off))
3769         return false;
3770       off = base;
3771       base = size_int (pbytepos);
3772     }
3773   /* Otherwise put base + constant offset into the loop invariant BASE
3774      and continue with OFF.  */
3775   else
3776     {
3777       base = fold_convert (sizetype, base);
3778       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3779     }
3780
3781   /* OFF at this point may be either a SSA_NAME or some tree expression
3782      from get_inner_reference.  Try to peel off loop invariants from it
3783      into BASE as long as possible.  */
3784   STRIP_NOPS (off);
3785   while (offtype == NULL_TREE)
3786     {
3787       enum tree_code code;
3788       tree op0, op1, add = NULL_TREE;
3789
3790       if (TREE_CODE (off) == SSA_NAME)
3791         {
3792           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3793
3794           if (expr_invariant_in_loop_p (loop, off))
3795             return false;
3796
3797           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3798             break;
3799
3800           op0 = gimple_assign_rhs1 (def_stmt);
3801           code = gimple_assign_rhs_code (def_stmt);
3802           op1 = gimple_assign_rhs2 (def_stmt);
3803         }
3804       else
3805         {
3806           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3807             return false;
3808           code = TREE_CODE (off);
3809           extract_ops_from_tree (off, &code, &op0, &op1);
3810         }
3811       switch (code)
3812         {
3813         case POINTER_PLUS_EXPR:
3814         case PLUS_EXPR:
3815           if (expr_invariant_in_loop_p (loop, op0))
3816             {
3817               add = op0;
3818               off = op1;
3819             do_add:
3820               add = fold_convert (sizetype, add);
3821               if (scale != 1)
3822                 add = size_binop (MULT_EXPR, add, size_int (scale));
3823               base = size_binop (PLUS_EXPR, base, add);
3824               continue;
3825             }
3826           if (expr_invariant_in_loop_p (loop, op1))
3827             {
3828               add = op1;
3829               off = op0;
3830               goto do_add;
3831             }
3832           break;
3833         case MINUS_EXPR:
3834           if (expr_invariant_in_loop_p (loop, op1))
3835             {
3836               add = fold_convert (sizetype, op1);
3837               add = size_binop (MINUS_EXPR, size_zero_node, add);
3838               off = op0;
3839               goto do_add;
3840             }
3841           break;
3842         case MULT_EXPR:
3843           if (scale == 1 && tree_fits_shwi_p (op1))
3844             {
3845               int new_scale = tree_to_shwi (op1);
3846               /* Only treat this as a scaling operation if the target
3847                  supports it.  */
3848               if (use_ifn_p
3849                   && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3850                                                 vectype, memory_type, 1,
3851                                                 TYPE_SIGN (TREE_TYPE (op0)),
3852                                                 new_scale, &ifn,
3853                                                 &element_type))
3854                 break;
3855               scale = new_scale;
3856               off = op0;
3857               continue;
3858             }
3859           break;
3860         case SSA_NAME:
3861           off = op0;
3862           continue;
3863         CASE_CONVERT:
3864           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3865               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3866             break;
3867           if (TYPE_PRECISION (TREE_TYPE (op0))
3868               == TYPE_PRECISION (TREE_TYPE (off)))
3869             {
3870               off = op0;
3871               continue;
3872             }
3873
3874           /* The internal functions need the offset to be the same width
3875              as the elements of VECTYPE.  Don't include operations that
3876              cast the offset from that width to a different width.  */
3877           if (use_ifn_p
3878               && (int_size_in_bytes (TREE_TYPE (vectype))
3879                   == int_size_in_bytes (TREE_TYPE (off))))
3880             break;
3881
3882           if (TYPE_PRECISION (TREE_TYPE (op0))
3883               < TYPE_PRECISION (TREE_TYPE (off)))
3884             {
3885               off = op0;
3886               offtype = TREE_TYPE (off);
3887               STRIP_NOPS (off);
3888               continue;
3889             }
3890           break;
3891         default:
3892           break;
3893         }
3894       break;
3895     }
3896
3897   /* If at the end OFF still isn't a SSA_NAME or isn't
3898      defined in the loop, punt.  */
3899   if (TREE_CODE (off) != SSA_NAME
3900       || expr_invariant_in_loop_p (loop, off))
3901     return false;
3902
3903   if (offtype == NULL_TREE)
3904     offtype = TREE_TYPE (off);
3905
3906   if (use_ifn_p)
3907     {
3908       if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3909                                      memory_type, TYPE_PRECISION (offtype),
3910                                      TYPE_SIGN (offtype), scale, &ifn,
3911                                      &element_type))
3912         return false;
3913     }
3914   else
3915     {
3916       if (DR_IS_READ (dr))
3917         {
3918           if (targetm.vectorize.builtin_gather)
3919             decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3920         }
3921       else
3922         {
3923           if (targetm.vectorize.builtin_scatter)
3924             decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3925         }
3926
3927       if (!decl)
3928         return false;
3929
3930       ifn = IFN_LAST;
3931       element_type = TREE_TYPE (vectype);
3932     }
3933
3934   info->ifn = ifn;
3935   info->decl = decl;
3936   info->base = base;
3937   info->offset = off;
3938   info->offset_dt = vect_unknown_def_type;
3939   info->offset_vectype = NULL_TREE;
3940   info->scale = scale;
3941   info->element_type = element_type;
3942   info->memory_type = memory_type;
3943   return true;
3944 }
3945
3946 /* Find the data references in STMT, analyze them with respect to LOOP and
3947    append them to DATAREFS.  Return false if datarefs in this stmt cannot
3948    be handled.  */
3949
3950 bool
3951 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3952                                vec<data_reference_p> *datarefs)
3953 {
3954   /* We can ignore clobbers for dataref analysis - they are removed during
3955      loop vectorization and BB vectorization checks dependences with a
3956      stmt walk.  */
3957   if (gimple_clobber_p (stmt))
3958     return true;
3959
3960   if (gimple_has_volatile_ops (stmt))
3961     {
3962       if (dump_enabled_p ())
3963         {
3964           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3965                            "not vectorized: volatile type ");
3966           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3967         }
3968       return false;
3969     }
3970
3971   if (stmt_can_throw_internal (stmt))
3972     {
3973       if (dump_enabled_p ())
3974         {
3975           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3976                            "not vectorized: statement can throw an "
3977                            "exception ");
3978           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3979         }
3980       return false;
3981     }
3982
3983   auto_vec<data_reference_p, 2> refs;
3984   if (!find_data_references_in_stmt (loop, stmt, &refs))
3985     return false;
3986
3987   if (refs.is_empty ())
3988     return true;
3989
3990   if (refs.length () > 1)
3991     {
3992       if (dump_enabled_p ())
3993         {
3994           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3995                            "not vectorized: more than one data ref "
3996                            "in stmt: ");
3997           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3998         }
3999       return false;
4000     }
4001
4002   if (gcall *call = dyn_cast <gcall *> (stmt))
4003     if (!gimple_call_internal_p (call)
4004         || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4005             && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4006       {
4007         if (dump_enabled_p ())
4008           {
4009             dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
4010                              "not vectorized: dr in a call ");
4011             dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4012           }
4013         return false;
4014       }
4015
4016   data_reference_p dr = refs.pop ();
4017   if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4018       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4019     {
4020       if (dump_enabled_p ())
4021         {
4022           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4023                            "not vectorized: statement is bitfield "
4024                            "access ");
4025           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4026         }
4027       return false;
4028     }
4029
4030   if (DR_BASE_ADDRESS (dr)
4031       && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4032     {
4033       if (dump_enabled_p ())
4034         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4035                          "not vectorized: base addr of dr is a "
4036                          "constant\n");
4037       return false;
4038     }
4039
4040   /* Check whether this may be a SIMD lane access and adjust the
4041      DR to make it easier for us to handle it.  */
4042   if (loop
4043       && loop->simduid
4044       && (!DR_BASE_ADDRESS (dr)
4045           || !DR_OFFSET (dr)
4046           || !DR_INIT (dr)
4047           || !DR_STEP (dr)))
4048     {
4049       struct data_reference *newdr
4050         = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4051                            DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4052       if (DR_BASE_ADDRESS (newdr)
4053           && DR_OFFSET (newdr)
4054           && DR_INIT (newdr)
4055           && DR_STEP (newdr)
4056           && integer_zerop (DR_STEP (newdr)))
4057         {
4058           tree off = DR_OFFSET (newdr);
4059           STRIP_NOPS (off);
4060           if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4061               && TREE_CODE (off) == MULT_EXPR
4062               && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4063             {
4064               tree step = TREE_OPERAND (off, 1);
4065               off = TREE_OPERAND (off, 0);
4066               STRIP_NOPS (off);
4067               if (CONVERT_EXPR_P (off)
4068                   && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4069                       < TYPE_PRECISION (TREE_TYPE (off))))
4070                 off = TREE_OPERAND (off, 0);
4071               if (TREE_CODE (off) == SSA_NAME)
4072                 {
4073                   gimple *def = SSA_NAME_DEF_STMT (off);
4074                   tree reft = TREE_TYPE (DR_REF (newdr));
4075                   if (is_gimple_call (def)
4076                       && gimple_call_internal_p (def)
4077                       && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4078                     {
4079                       tree arg = gimple_call_arg (def, 0);
4080                       gcc_assert (TREE_CODE (arg) == SSA_NAME);
4081                       arg = SSA_NAME_VAR (arg);
4082                       if (arg == loop->simduid
4083                           /* For now.  */
4084                           && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4085                         {
4086                           DR_OFFSET (newdr) = ssize_int (0);
4087                           DR_STEP (newdr) = step;
4088                           DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4089                           DR_STEP_ALIGNMENT (newdr)
4090                             = highest_pow2_factor (step);
4091                           /* Mark as simd-lane access.  */
4092                           newdr->aux = (void *)-1;
4093                           free_data_ref (dr);
4094                           datarefs->safe_push (newdr);
4095                           return true;
4096                         }
4097                     }
4098                 }
4099             }
4100         }
4101       free_data_ref (newdr);
4102     }
4103
4104   datarefs->safe_push (dr);
4105   return true;
4106 }
4107
4108 /* Function vect_analyze_data_refs.
4109
4110   Find all the data references in the loop or basic block.
4111
4112    The general structure of the analysis of data refs in the vectorizer is as
4113    follows:
4114    1- vect_analyze_data_refs(loop/bb): call
4115       compute_data_dependences_for_loop/bb to find and analyze all data-refs
4116       in the loop/bb and their dependences.
4117    2- vect_analyze_dependences(): apply dependence testing using ddrs.
4118    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4119    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4120
4121 */
4122
4123 bool
4124 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4125 {
4126   struct loop *loop = NULL;
4127   unsigned int i;
4128   struct data_reference *dr;
4129   tree scalar_type;
4130
4131   DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4132
4133   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4134     loop = LOOP_VINFO_LOOP (loop_vinfo);
4135
4136   /* Go through the data-refs, check that the analysis succeeded.  Update
4137      pointer from stmt_vec_info struct to DR and vectype.  */
4138
4139   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4140   FOR_EACH_VEC_ELT (datarefs, i, dr)
4141     {
4142       gimple *stmt;
4143       stmt_vec_info stmt_info;
4144       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4145       poly_uint64 vf;
4146
4147       gcc_assert (DR_REF (dr));
4148       stmt = vect_dr_stmt (dr);
4149       stmt_info = vinfo_for_stmt (stmt);
4150
4151       /* Check that analysis of the data-ref succeeded.  */
4152       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4153           || !DR_STEP (dr))
4154         {
4155           bool maybe_gather
4156             = DR_IS_READ (dr)
4157               && !TREE_THIS_VOLATILE (DR_REF (dr))
4158               && (targetm.vectorize.builtin_gather != NULL
4159                   || supports_vec_gather_load_p ());
4160           bool maybe_scatter
4161             = DR_IS_WRITE (dr)
4162               && !TREE_THIS_VOLATILE (DR_REF (dr))
4163               && (targetm.vectorize.builtin_scatter != NULL
4164                   || supports_vec_scatter_store_p ());
4165
4166           /* If target supports vector gather loads or scatter stores,
4167              see if they can't be used.  */
4168           if (is_a <loop_vec_info> (vinfo)
4169               && !nested_in_vect_loop_p (loop, stmt))
4170             {
4171               if (maybe_gather || maybe_scatter)
4172                 {
4173                   if (maybe_gather)
4174                     gatherscatter = GATHER;
4175                   else
4176                     gatherscatter = SCATTER;
4177                 }
4178             }
4179
4180           if (gatherscatter == SG_NONE)
4181             {
4182               if (dump_enabled_p ())
4183                 {
4184                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4185                                    "not vectorized: data ref analysis "
4186                                    "failed ");
4187                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4188                 }
4189               if (is_a <bb_vec_info> (vinfo))
4190                 {
4191                   /* In BB vectorization the ref can still participate
4192                      in dependence analysis, we just can't vectorize it.  */
4193                   STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4194                   continue;
4195                 }
4196               return false;
4197             }
4198         }
4199
4200       /* See if this was detected as SIMD lane access.  */
4201       if (dr->aux == (void *)-1)
4202         {
4203           if (nested_in_vect_loop_p (loop, stmt))
4204             {
4205               if (dump_enabled_p ())
4206                 {
4207                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4208                                    "not vectorized: data ref analysis "
4209                                    "failed ");
4210                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4211                 }
4212               return false;
4213             }
4214           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4215         }
4216
4217       tree base = get_base_address (DR_REF (dr));
4218       if (base && VAR_P (base) && DECL_NONALIASED (base))
4219         {
4220           if (dump_enabled_p ())
4221             {
4222               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4223                                "not vectorized: base object not addressable "
4224                                "for stmt: ");
4225               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4226             }
4227           if (is_a <bb_vec_info> (vinfo))
4228             {
4229               /* In BB vectorization the ref can still participate
4230                  in dependence analysis, we just can't vectorize it.  */
4231               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4232               continue;
4233             }
4234           return false;
4235         }
4236
4237       if (is_a <loop_vec_info> (vinfo)
4238           && DR_STEP (dr)
4239           && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4240         {
4241           if (nested_in_vect_loop_p (loop, stmt))
4242             {
4243               if (dump_enabled_p ())
4244                 {
4245                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4246                                    "not vectorized: not suitable for strided "
4247                                    "load ");
4248                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4249                 }
4250               return false;
4251             }
4252           STMT_VINFO_STRIDED_P (stmt_info) = true;
4253         }
4254
4255       /* Update DR field in stmt_vec_info struct.  */
4256
4257       /* If the dataref is in an inner-loop of the loop that is considered for
4258          for vectorization, we also want to analyze the access relative to
4259          the outer-loop (DR contains information only relative to the
4260          inner-most enclosing loop).  We do that by building a reference to the
4261          first location accessed by the inner-loop, and analyze it relative to
4262          the outer-loop.  */
4263       if (loop && nested_in_vect_loop_p (loop, stmt))
4264         {
4265           /* Build a reference to the first location accessed by the
4266              inner loop: *(BASE + INIT + OFFSET).  By construction,
4267              this address must be invariant in the inner loop, so we
4268              can consider it as being used in the outer loop.  */
4269           tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4270           tree offset = unshare_expr (DR_OFFSET (dr));
4271           tree init = unshare_expr (DR_INIT (dr));
4272           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4273                                           init, offset);
4274           tree init_addr = fold_build_pointer_plus (base, init_offset);
4275           tree init_ref = build_fold_indirect_ref (init_addr);
4276
4277           if (dump_enabled_p ())
4278             {
4279               dump_printf_loc (MSG_NOTE, vect_location,
4280                                "analyze in outer loop: ");
4281               dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4282               dump_printf (MSG_NOTE, "\n");
4283             }
4284
4285           if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4286                                      init_ref, loop))
4287             /* dr_analyze_innermost already explained the failure.  */
4288             return false;
4289
4290           if (dump_enabled_p ())
4291             {
4292               dump_printf_loc (MSG_NOTE, vect_location,
4293                                "\touter base_address: ");
4294               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4295                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4296               dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4297               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4298                                  STMT_VINFO_DR_OFFSET (stmt_info));
4299               dump_printf (MSG_NOTE,
4300                            "\n\touter constant offset from base address: ");
4301               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4302                                  STMT_VINFO_DR_INIT (stmt_info));
4303               dump_printf (MSG_NOTE, "\n\touter step: ");
4304               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4305                                  STMT_VINFO_DR_STEP (stmt_info));
4306               dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4307                            STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4308               dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4309                            STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4310               dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4311                            STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4312               dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4313                            STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4314             }
4315         }
4316
4317       gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
4318       STMT_VINFO_DATA_REF (stmt_info) = dr;
4319
4320       /* Set vectype for STMT.  */
4321       scalar_type = TREE_TYPE (DR_REF (dr));
4322       STMT_VINFO_VECTYPE (stmt_info)
4323         = get_vectype_for_scalar_type (scalar_type);
4324       if (!STMT_VINFO_VECTYPE (stmt_info))
4325         {
4326           if (dump_enabled_p ())
4327             {
4328               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4329                                "not vectorized: no vectype for stmt: ");
4330               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4331               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4332               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4333                                  scalar_type);
4334               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4335             }
4336
4337           if (is_a <bb_vec_info> (vinfo))
4338             {
4339               /* No vector type is fine, the ref can still participate
4340                  in dependence analysis, we just can't vectorize it.  */
4341               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4342               continue;
4343             }
4344           return false;
4345         }
4346       else
4347         {
4348           if (dump_enabled_p ())
4349             {
4350               dump_printf_loc (MSG_NOTE, vect_location,
4351                                "got vectype for stmt: ");
4352               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4353               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4354                                  STMT_VINFO_VECTYPE (stmt_info));
4355               dump_printf (MSG_NOTE, "\n");
4356             }
4357         }
4358
4359       /* Adjust the minimal vectorization factor according to the
4360          vector type.  */
4361       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4362       *min_vf = upper_bound (*min_vf, vf);
4363
4364       if (gatherscatter != SG_NONE)
4365         {
4366           gather_scatter_info gs_info;
4367           if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4368                                           &gs_info)
4369               || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4370             {
4371               if (dump_enabled_p ())
4372                 {
4373                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4374                                    (gatherscatter == GATHER) ?
4375                                    "not vectorized: not suitable for gather "
4376                                    "load " :
4377                                    "not vectorized: not suitable for scatter "
4378                                    "store ");
4379                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4380                 }
4381               return false;
4382             }
4383           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4384         }
4385     }
4386
4387   /* We used to stop processing and prune the list here.  Verify we no
4388      longer need to.  */
4389   gcc_assert (i == datarefs.length ());
4390
4391   return true;
4392 }
4393
4394
4395 /* Function vect_get_new_vect_var.
4396
4397    Returns a name for a new variable.  The current naming scheme appends the
4398    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4399    the name of vectorizer generated variables, and appends that to NAME if
4400    provided.  */
4401
4402 tree
4403 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4404 {
4405   const char *prefix;
4406   tree new_vect_var;
4407
4408   switch (var_kind)
4409   {
4410   case vect_simple_var:
4411     prefix = "vect";
4412     break;
4413   case vect_scalar_var:
4414     prefix = "stmp";
4415     break;
4416   case vect_mask_var:
4417     prefix = "mask";
4418     break;
4419   case vect_pointer_var:
4420     prefix = "vectp";
4421     break;
4422   default:
4423     gcc_unreachable ();
4424   }
4425
4426   if (name)
4427     {
4428       char* tmp = concat (prefix, "_", name, NULL);
4429       new_vect_var = create_tmp_reg (type, tmp);
4430       free (tmp);
4431     }
4432   else
4433     new_vect_var = create_tmp_reg (type, prefix);
4434
4435   return new_vect_var;
4436 }
4437
4438 /* Like vect_get_new_vect_var but return an SSA name.  */
4439
4440 tree
4441 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4442 {
4443   const char *prefix;
4444   tree new_vect_var;
4445
4446   switch (var_kind)
4447   {
4448   case vect_simple_var:
4449     prefix = "vect";
4450     break;
4451   case vect_scalar_var:
4452     prefix = "stmp";
4453     break;
4454   case vect_pointer_var:
4455     prefix = "vectp";
4456     break;
4457   default:
4458     gcc_unreachable ();
4459   }
4460
4461   if (name)
4462     {
4463       char* tmp = concat (prefix, "_", name, NULL);
4464       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4465       free (tmp);
4466     }
4467   else
4468     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4469
4470   return new_vect_var;
4471 }
4472
4473 /* Duplicate ptr info and set alignment/misaligment on NAME from DR.  */
4474
4475 static void
4476 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4477 {
4478   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4479   int misalign = DR_MISALIGNMENT (dr);
4480   if (misalign == DR_MISALIGNMENT_UNKNOWN)
4481     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4482   else
4483     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4484                             DR_TARGET_ALIGNMENT (dr), misalign);
4485 }
4486
4487 /* Function vect_create_addr_base_for_vector_ref.
4488
4489    Create an expression that computes the address of the first memory location
4490    that will be accessed for a data reference.
4491
4492    Input:
4493    STMT: The statement containing the data reference.
4494    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4495    OFFSET: Optional. If supplied, it is be added to the initial address.
4496    LOOP:    Specify relative to which loop-nest should the address be computed.
4497             For example, when the dataref is in an inner-loop nested in an
4498             outer-loop that is now being vectorized, LOOP can be either the
4499             outer-loop, or the inner-loop.  The first memory location accessed
4500             by the following dataref ('in' points to short):
4501
4502                 for (i=0; i<N; i++)
4503                    for (j=0; j<M; j++)
4504                      s += in[i+j]
4505
4506             is as follows:
4507             if LOOP=i_loop:     &in             (relative to i_loop)
4508             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
4509    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4510             initial address.  Unlike OFFSET, which is number of elements to
4511             be added, BYTE_OFFSET is measured in bytes.
4512
4513    Output:
4514    1. Return an SSA_NAME whose value is the address of the memory location of
4515       the first vector of the data reference.
4516    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4517       these statement(s) which define the returned SSA_NAME.
4518
4519    FORNOW: We are only handling array accesses with step 1.  */
4520
4521 tree
4522 vect_create_addr_base_for_vector_ref (gimple *stmt,
4523                                       gimple_seq *new_stmt_list,
4524                                       tree offset,
4525                                       tree byte_offset)
4526 {
4527   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4528   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4529   const char *base_name;
4530   tree addr_base;
4531   tree dest;
4532   gimple_seq seq = NULL;
4533   tree vect_ptr_type;
4534   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4535   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4536   innermost_loop_behavior *drb = vect_dr_behavior (dr);
4537
4538   tree data_ref_base = unshare_expr (drb->base_address);
4539   tree base_offset = unshare_expr (drb->offset);
4540   tree init = unshare_expr (drb->init);
4541
4542   if (loop_vinfo)
4543     base_name = get_name (data_ref_base);
4544   else
4545     {
4546       base_offset = ssize_int (0);
4547       init = ssize_int (0);
4548       base_name = get_name (DR_REF (dr));
4549     }
4550
4551   /* Create base_offset */
4552   base_offset = size_binop (PLUS_EXPR,
4553                             fold_convert (sizetype, base_offset),
4554                             fold_convert (sizetype, init));
4555
4556   if (offset)
4557     {
4558       offset = fold_build2 (MULT_EXPR, sizetype,
4559                             fold_convert (sizetype, offset), step);
4560       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4561                                  base_offset, offset);
4562     }
4563   if (byte_offset)
4564     {
4565       byte_offset = fold_convert (sizetype, byte_offset);
4566       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4567                                  base_offset, byte_offset);
4568     }
4569
4570   /* base + base_offset */
4571   if (loop_vinfo)
4572     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4573   else
4574     {
4575       addr_base = build1 (ADDR_EXPR,
4576                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
4577                           unshare_expr (DR_REF (dr)));
4578     }
4579
4580   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4581   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4582   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4583   gimple_seq_add_seq (new_stmt_list, seq);
4584
4585   if (DR_PTR_INFO (dr)
4586       && TREE_CODE (addr_base) == SSA_NAME
4587       && !SSA_NAME_PTR_INFO (addr_base))
4588     {
4589       vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4590       if (offset || byte_offset)
4591         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4592     }
4593
4594   if (dump_enabled_p ())
4595     {
4596       dump_printf_loc (MSG_NOTE, vect_location, "created ");
4597       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4598       dump_printf (MSG_NOTE, "\n");
4599     }
4600
4601   return addr_base;
4602 }
4603
4604
4605 /* Function vect_create_data_ref_ptr.
4606
4607    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4608    location accessed in the loop by STMT, along with the def-use update
4609    chain to appropriately advance the pointer through the loop iterations.
4610    Also set aliasing information for the pointer.  This pointer is used by
4611    the callers to this function to create a memory reference expression for
4612    vector load/store access.
4613
4614    Input:
4615    1. STMT: a stmt that references memory. Expected to be of the form
4616          GIMPLE_ASSIGN <name, data-ref> or
4617          GIMPLE_ASSIGN <data-ref, name>.
4618    2. AGGR_TYPE: the type of the reference, which should be either a vector
4619         or an array.
4620    3. AT_LOOP: the loop where the vector memref is to be created.
4621    4. OFFSET (optional): an offset to be added to the initial address accessed
4622         by the data-ref in STMT.
4623    5. BSI: location where the new stmts are to be placed if there is no loop
4624    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4625         pointing to the initial address.
4626    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4627         to the initial address accessed by the data-ref in STMT.  This is
4628         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4629         in bytes.
4630    8. IV_STEP (optional, defaults to NULL): the amount that should be added
4631         to the IV during each iteration of the loop.  NULL says to move
4632         by one copy of AGGR_TYPE up or down, depending on the step of the
4633         data reference.
4634
4635    Output:
4636    1. Declare a new ptr to vector_type, and have it point to the base of the
4637       data reference (initial addressed accessed by the data reference).
4638       For example, for vector of type V8HI, the following code is generated:
4639
4640       v8hi *ap;
4641       ap = (v8hi *)initial_address;
4642
4643       if OFFSET is not supplied:
4644          initial_address = &a[init];
4645       if OFFSET is supplied:
4646          initial_address = &a[init + OFFSET];
4647       if BYTE_OFFSET is supplied:
4648          initial_address = &a[init] + BYTE_OFFSET;
4649
4650       Return the initial_address in INITIAL_ADDRESS.
4651
4652    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4653       update the pointer in each iteration of the loop.
4654
4655       Return the increment stmt that updates the pointer in PTR_INCR.
4656
4657    3. Set INV_P to true if the access pattern of the data reference in the
4658       vectorized loop is invariant.  Set it to false otherwise.
4659
4660    4. Return the pointer.  */
4661
4662 tree
4663 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4664                           tree offset, tree *initial_address,
4665                           gimple_stmt_iterator *gsi, gimple **ptr_incr,
4666                           bool only_init, bool *inv_p, tree byte_offset,
4667                           tree iv_step)
4668 {
4669   const char *base_name;
4670   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4671   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4672   struct loop *loop = NULL;
4673   bool nested_in_vect_loop = false;
4674   struct loop *containing_loop = NULL;
4675   tree aggr_ptr_type;
4676   tree aggr_ptr;
4677   tree new_temp;
4678   gimple_seq new_stmt_list = NULL;
4679   edge pe = NULL;
4680   basic_block new_bb;
4681   tree aggr_ptr_init;
4682   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4683   tree aptr;
4684   gimple_stmt_iterator incr_gsi;
4685   bool insert_after;
4686   tree indx_before_incr, indx_after_incr;
4687   gimple *incr;
4688   tree step;
4689   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4690
4691   gcc_assert (iv_step != NULL_TREE
4692               || TREE_CODE (aggr_type) == ARRAY_TYPE
4693               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4694
4695   if (loop_vinfo)
4696     {
4697       loop = LOOP_VINFO_LOOP (loop_vinfo);
4698       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4699       containing_loop = (gimple_bb (stmt))->loop_father;
4700       pe = loop_preheader_edge (loop);
4701     }
4702   else
4703     {
4704       gcc_assert (bb_vinfo);
4705       only_init = true;
4706       *ptr_incr = NULL;
4707     }
4708
4709   /* Check the step (evolution) of the load in LOOP, and record
4710      whether it's invariant.  */
4711   step = vect_dr_behavior (dr)->step;
4712   if (integer_zerop (step))
4713     *inv_p = true;
4714   else
4715     *inv_p = false;
4716
4717   /* Create an expression for the first address accessed by this load
4718      in LOOP.  */
4719   base_name = get_name (DR_BASE_ADDRESS (dr));
4720
4721   if (dump_enabled_p ())
4722     {
4723       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4724       dump_printf_loc (MSG_NOTE, vect_location,
4725                        "create %s-pointer variable to type: ",
4726                        get_tree_code_name (TREE_CODE (aggr_type)));
4727       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4728       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4729         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4730       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4731         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4732       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4733         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4734       else
4735         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4736       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4737       dump_printf (MSG_NOTE, "\n");
4738     }
4739
4740   /* (1) Create the new aggregate-pointer variable.
4741      Vector and array types inherit the alias set of their component
4742      type by default so we need to use a ref-all pointer if the data
4743      reference does not conflict with the created aggregated data
4744      reference because it is not addressable.  */
4745   bool need_ref_all = false;
4746   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4747                               get_alias_set (DR_REF (dr))))
4748     need_ref_all = true;
4749   /* Likewise for any of the data references in the stmt group.  */
4750   else if (DR_GROUP_SIZE (stmt_info) > 1)
4751     {
4752       gimple *orig_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info);
4753       do
4754         {
4755           stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4756           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4757           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4758                                       get_alias_set (DR_REF (sdr))))
4759             {
4760               need_ref_all = true;
4761               break;
4762             }
4763           orig_stmt = DR_GROUP_NEXT_ELEMENT (sinfo);
4764         }
4765       while (orig_stmt);
4766     }
4767   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4768                                                need_ref_all);
4769   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4770
4771
4772   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4773      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4774      def-use update cycles for the pointer: one relative to the outer-loop
4775      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4776      to the inner-loop (which is the inner-most loop containing the dataref),
4777      and this is done be step (5) below.
4778
4779      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4780      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4781      redundant.  Steps (3),(4) create the following:
4782
4783         vp0 = &base_addr;
4784         LOOP:   vp1 = phi(vp0,vp2)
4785                 ...
4786                 ...
4787                 vp2 = vp1 + step
4788                 goto LOOP
4789
4790      If there is an inner-loop nested in loop, then step (5) will also be
4791      applied, and an additional update in the inner-loop will be created:
4792
4793         vp0 = &base_addr;
4794         LOOP:   vp1 = phi(vp0,vp2)
4795                 ...
4796         inner:     vp3 = phi(vp1,vp4)
4797                    vp4 = vp3 + inner_step
4798                    if () goto inner
4799                 ...
4800                 vp2 = vp1 + step
4801                 if () goto LOOP   */
4802
4803   /* (2) Calculate the initial address of the aggregate-pointer, and set
4804      the aggregate-pointer to point to it before the loop.  */
4805
4806   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4807
4808   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4809                                                    offset, byte_offset);
4810   if (new_stmt_list)
4811     {
4812       if (pe)
4813         {
4814           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4815           gcc_assert (!new_bb);
4816         }
4817       else
4818         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4819     }
4820
4821   *initial_address = new_temp;
4822   aggr_ptr_init = new_temp;
4823
4824   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4825      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4826      inner-loop nested in LOOP (during outer-loop vectorization).  */
4827
4828   /* No update in loop is required.  */
4829   if (only_init && (!loop_vinfo || at_loop == loop))
4830     aptr = aggr_ptr_init;
4831   else
4832     {
4833       if (iv_step == NULL_TREE)
4834         {
4835           /* The step of the aggregate pointer is the type size.  */
4836           iv_step = TYPE_SIZE_UNIT (aggr_type);
4837           /* One exception to the above is when the scalar step of the load in
4838              LOOP is zero. In this case the step here is also zero.  */
4839           if (*inv_p)
4840             iv_step = size_zero_node;
4841           else if (tree_int_cst_sgn (step) == -1)
4842             iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4843         }
4844
4845       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4846
4847       create_iv (aggr_ptr_init,
4848                  fold_convert (aggr_ptr_type, iv_step),
4849                  aggr_ptr, loop, &incr_gsi, insert_after,
4850                  &indx_before_incr, &indx_after_incr);
4851       incr = gsi_stmt (incr_gsi);
4852       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4853
4854       /* Copy the points-to information if it exists. */
4855       if (DR_PTR_INFO (dr))
4856         {
4857           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4858           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4859         }
4860       if (ptr_incr)
4861         *ptr_incr = incr;
4862
4863       aptr = indx_before_incr;
4864     }
4865
4866   if (!nested_in_vect_loop || only_init)
4867     return aptr;
4868
4869
4870   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4871      nested in LOOP, if exists.  */
4872
4873   gcc_assert (nested_in_vect_loop);
4874   if (!only_init)
4875     {
4876       standard_iv_increment_position (containing_loop, &incr_gsi,
4877                                       &insert_after);
4878       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4879                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4880                  &indx_after_incr);
4881       incr = gsi_stmt (incr_gsi);
4882       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4883
4884       /* Copy the points-to information if it exists. */
4885       if (DR_PTR_INFO (dr))
4886         {
4887           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4888           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4889         }
4890       if (ptr_incr)
4891         *ptr_incr = incr;
4892
4893       return indx_before_incr;
4894     }
4895   else
4896     gcc_unreachable ();
4897 }
4898
4899
4900 /* Function bump_vector_ptr
4901
4902    Increment a pointer (to a vector type) by vector-size. If requested,
4903    i.e. if PTR-INCR is given, then also connect the new increment stmt
4904    to the existing def-use update-chain of the pointer, by modifying
4905    the PTR_INCR as illustrated below:
4906
4907    The pointer def-use update-chain before this function:
4908                         DATAREF_PTR = phi (p_0, p_2)
4909                         ....
4910         PTR_INCR:       p_2 = DATAREF_PTR + step
4911
4912    The pointer def-use update-chain after this function:
4913                         DATAREF_PTR = phi (p_0, p_2)
4914                         ....
4915                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4916                         ....
4917         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4918
4919    Input:
4920    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4921                  in the loop.
4922    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4923               the loop.  The increment amount across iterations is expected
4924               to be vector_size.
4925    BSI - location where the new update stmt is to be placed.
4926    STMT - the original scalar memory-access stmt that is being vectorized.
4927    BUMP - optional. The offset by which to bump the pointer. If not given,
4928           the offset is assumed to be vector_size.
4929
4930    Output: Return NEW_DATAREF_PTR as illustrated above.
4931
4932 */
4933
4934 tree
4935 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4936                  gimple *stmt, tree bump)
4937 {
4938   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4939   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4940   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4941   tree update = TYPE_SIZE_UNIT (vectype);
4942   gassign *incr_stmt;
4943   ssa_op_iter iter;
4944   use_operand_p use_p;
4945   tree new_dataref_ptr;
4946
4947   if (bump)
4948     update = bump;
4949
4950   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4951     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4952   else
4953     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4954   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4955                                    dataref_ptr, update);
4956   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4957
4958   /* Copy the points-to information if it exists. */
4959   if (DR_PTR_INFO (dr))
4960     {
4961       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4962       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4963     }
4964
4965   if (!ptr_incr)
4966     return new_dataref_ptr;
4967
4968   /* Update the vector-pointer's cross-iteration increment.  */
4969   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4970     {
4971       tree use = USE_FROM_PTR (use_p);
4972
4973       if (use == dataref_ptr)
4974         SET_USE (use_p, new_dataref_ptr);
4975       else
4976         gcc_assert (operand_equal_p (use, update, 0));
4977     }
4978
4979   return new_dataref_ptr;
4980 }
4981
4982
4983 /* Copy memory reference info such as base/clique from the SRC reference
4984    to the DEST MEM_REF.  */
4985
4986 void
4987 vect_copy_ref_info (tree dest, tree src)
4988 {
4989   if (TREE_CODE (dest) != MEM_REF)
4990     return;
4991
4992   tree src_base = src;
4993   while (handled_component_p (src_base))
4994     src_base = TREE_OPERAND (src_base, 0);
4995   if (TREE_CODE (src_base) != MEM_REF
4996       && TREE_CODE (src_base) != TARGET_MEM_REF)
4997     return;
4998
4999   MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5000   MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5001 }
5002
5003
5004 /* Function vect_create_destination_var.
5005
5006    Create a new temporary of type VECTYPE.  */
5007
5008 tree
5009 vect_create_destination_var (tree scalar_dest, tree vectype)
5010 {
5011   tree vec_dest;
5012   const char *name;
5013   char *new_name;
5014   tree type;
5015   enum vect_var_kind kind;
5016
5017   kind = vectype
5018     ? VECTOR_BOOLEAN_TYPE_P (vectype)
5019     ? vect_mask_var
5020     : vect_simple_var
5021     : vect_scalar_var;
5022   type = vectype ? vectype : TREE_TYPE (scalar_dest);
5023
5024   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5025
5026   name = get_name (scalar_dest);
5027   if (name)
5028     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5029   else
5030     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5031   vec_dest = vect_get_new_vect_var (type, kind, new_name);
5032   free (new_name);
5033
5034   return vec_dest;
5035 }
5036
5037 /* Function vect_grouped_store_supported.
5038
5039    Returns TRUE if interleave high and interleave low permutations
5040    are supported, and FALSE otherwise.  */
5041
5042 bool
5043 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5044 {
5045   machine_mode mode = TYPE_MODE (vectype);
5046
5047   /* vect_permute_store_chain requires the group size to be equal to 3 or
5048      be a power of two.  */
5049   if (count != 3 && exact_log2 (count) == -1)
5050     {
5051       if (dump_enabled_p ())
5052         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5053                          "the size of the group of accesses"
5054                          " is not a power of 2 or not eqaul to 3\n");
5055       return false;
5056     }
5057
5058   /* Check that the permutation is supported.  */
5059   if (VECTOR_MODE_P (mode))
5060     {
5061       unsigned int i;
5062       if (count == 3)
5063         {
5064           unsigned int j0 = 0, j1 = 0, j2 = 0;
5065           unsigned int i, j;
5066
5067           unsigned int nelt;
5068           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5069             {
5070               if (dump_enabled_p ())
5071                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5072                                  "cannot handle groups of 3 stores for"
5073                                  " variable-length vectors\n");
5074               return false;
5075             }
5076
5077           vec_perm_builder sel (nelt, nelt, 1);
5078           sel.quick_grow (nelt);
5079           vec_perm_indices indices;
5080           for (j = 0; j < 3; j++)
5081             {
5082               int nelt0 = ((3 - j) * nelt) % 3;
5083               int nelt1 = ((3 - j) * nelt + 1) % 3;
5084               int nelt2 = ((3 - j) * nelt + 2) % 3;
5085               for (i = 0; i < nelt; i++)
5086                 {
5087                   if (3 * i + nelt0 < nelt)
5088                     sel[3 * i + nelt0] = j0++;
5089                   if (3 * i + nelt1 < nelt)
5090                     sel[3 * i + nelt1] = nelt + j1++;
5091                   if (3 * i + nelt2 < nelt)
5092                     sel[3 * i + nelt2] = 0;
5093                 }
5094               indices.new_vector (sel, 2, nelt);
5095               if (!can_vec_perm_const_p (mode, indices))
5096                 {
5097                   if (dump_enabled_p ())
5098                     dump_printf (MSG_MISSED_OPTIMIZATION,
5099                                  "permutation op not supported by target.\n");
5100                   return false;
5101                 }
5102
5103               for (i = 0; i < nelt; i++)
5104                 {
5105                   if (3 * i + nelt0 < nelt)
5106                     sel[3 * i + nelt0] = 3 * i + nelt0;
5107                   if (3 * i + nelt1 < nelt)
5108                     sel[3 * i + nelt1] = 3 * i + nelt1;
5109                   if (3 * i + nelt2 < nelt)
5110                     sel[3 * i + nelt2] = nelt + j2++;
5111                 }
5112               indices.new_vector (sel, 2, nelt);
5113               if (!can_vec_perm_const_p (mode, indices))
5114                 {
5115                   if (dump_enabled_p ())
5116                     dump_printf (MSG_MISSED_OPTIMIZATION,
5117                                  "permutation op not supported by target.\n");
5118                   return false;
5119                 }
5120             }
5121           return true;
5122         }
5123       else
5124         {
5125           /* If length is not equal to 3 then only power of 2 is supported.  */
5126           gcc_assert (pow2p_hwi (count));
5127           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5128
5129           /* The encoding has 2 interleaved stepped patterns.  */
5130           vec_perm_builder sel (nelt, 2, 3);
5131           sel.quick_grow (6);
5132           for (i = 0; i < 3; i++)
5133             {
5134               sel[i * 2] = i;
5135               sel[i * 2 + 1] = i + nelt;
5136             }
5137           vec_perm_indices indices (sel, 2, nelt);
5138           if (can_vec_perm_const_p (mode, indices))
5139             {
5140               for (i = 0; i < 6; i++)
5141                 sel[i] += exact_div (nelt, 2);
5142               indices.new_vector (sel, 2, nelt);
5143               if (can_vec_perm_const_p (mode, indices))
5144                 return true;
5145             }
5146         }
5147     }
5148
5149   if (dump_enabled_p ())
5150     dump_printf (MSG_MISSED_OPTIMIZATION,
5151                  "permutaion op not supported by target.\n");
5152   return false;
5153 }
5154
5155
5156 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5157    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5158
5159 bool
5160 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5161                             bool masked_p)
5162 {
5163   if (masked_p)
5164     return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5165                                          vec_mask_store_lanes_optab,
5166                                          vectype, count);
5167   else
5168     return vect_lanes_optab_supported_p ("vec_store_lanes",
5169                                          vec_store_lanes_optab,
5170                                          vectype, count);
5171 }
5172
5173
5174 /* Function vect_permute_store_chain.
5175
5176    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5177    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5178    the data correctly for the stores.  Return the final references for stores
5179    in RESULT_CHAIN.
5180
5181    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5182    The input is 4 vectors each containing 8 elements.  We assign a number to
5183    each element, the input sequence is:
5184
5185    1st vec:   0  1  2  3  4  5  6  7
5186    2nd vec:   8  9 10 11 12 13 14 15
5187    3rd vec:  16 17 18 19 20 21 22 23
5188    4th vec:  24 25 26 27 28 29 30 31
5189
5190    The output sequence should be:
5191
5192    1st vec:  0  8 16 24  1  9 17 25
5193    2nd vec:  2 10 18 26  3 11 19 27
5194    3rd vec:  4 12 20 28  5 13 21 30
5195    4th vec:  6 14 22 30  7 15 23 31
5196
5197    i.e., we interleave the contents of the four vectors in their order.
5198
5199    We use interleave_high/low instructions to create such output.  The input of
5200    each interleave_high/low operation is two vectors:
5201    1st vec    2nd vec
5202    0 1 2 3    4 5 6 7
5203    the even elements of the result vector are obtained left-to-right from the
5204    high/low elements of the first vector.  The odd elements of the result are
5205    obtained left-to-right from the high/low elements of the second vector.
5206    The output of interleave_high will be:   0 4 1 5
5207    and of interleave_low:                   2 6 3 7
5208
5209
5210    The permutation is done in log LENGTH stages.  In each stage interleave_high
5211    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5212    where the first argument is taken from the first half of DR_CHAIN and the
5213    second argument from it's second half.
5214    In our example,
5215
5216    I1: interleave_high (1st vec, 3rd vec)
5217    I2: interleave_low (1st vec, 3rd vec)
5218    I3: interleave_high (2nd vec, 4th vec)
5219    I4: interleave_low (2nd vec, 4th vec)
5220
5221    The output for the first stage is:
5222
5223    I1:  0 16  1 17  2 18  3 19
5224    I2:  4 20  5 21  6 22  7 23
5225    I3:  8 24  9 25 10 26 11 27
5226    I4: 12 28 13 29 14 30 15 31
5227
5228    The output of the second stage, i.e. the final result is:
5229
5230    I1:  0  8 16 24  1  9 17 25
5231    I2:  2 10 18 26  3 11 19 27
5232    I3:  4 12 20 28  5 13 21 30
5233    I4:  6 14 22 30  7 15 23 31.  */
5234
5235 void
5236 vect_permute_store_chain (vec<tree> dr_chain,
5237                           unsigned int length,
5238                           gimple *stmt,
5239                           gimple_stmt_iterator *gsi,
5240                           vec<tree> *result_chain)
5241 {
5242   tree vect1, vect2, high, low;
5243   gimple *perm_stmt;
5244   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5245   tree perm_mask_low, perm_mask_high;
5246   tree data_ref;
5247   tree perm3_mask_low, perm3_mask_high;
5248   unsigned int i, j, n, log_length = exact_log2 (length);
5249
5250   result_chain->quick_grow (length);
5251   memcpy (result_chain->address (), dr_chain.address (),
5252           length * sizeof (tree));
5253
5254   if (length == 3)
5255     {
5256       /* vect_grouped_store_supported ensures that this is constant.  */
5257       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5258       unsigned int j0 = 0, j1 = 0, j2 = 0;
5259
5260       vec_perm_builder sel (nelt, nelt, 1);
5261       sel.quick_grow (nelt);
5262       vec_perm_indices indices;
5263       for (j = 0; j < 3; j++)
5264         {
5265           int nelt0 = ((3 - j) * nelt) % 3;
5266           int nelt1 = ((3 - j) * nelt + 1) % 3;
5267           int nelt2 = ((3 - j) * nelt + 2) % 3;
5268
5269           for (i = 0; i < nelt; i++)
5270             {
5271               if (3 * i + nelt0 < nelt)
5272                 sel[3 * i + nelt0] = j0++;
5273               if (3 * i + nelt1 < nelt)
5274                 sel[3 * i + nelt1] = nelt + j1++;
5275               if (3 * i + nelt2 < nelt)
5276                 sel[3 * i + nelt2] = 0;
5277             }
5278           indices.new_vector (sel, 2, nelt);
5279           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5280
5281           for (i = 0; i < nelt; i++)
5282             {
5283               if (3 * i + nelt0 < nelt)
5284                 sel[3 * i + nelt0] = 3 * i + nelt0;
5285               if (3 * i + nelt1 < nelt)
5286                 sel[3 * i + nelt1] = 3 * i + nelt1;
5287               if (3 * i + nelt2 < nelt)
5288                 sel[3 * i + nelt2] = nelt + j2++;
5289             }
5290           indices.new_vector (sel, 2, nelt);
5291           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5292
5293           vect1 = dr_chain[0];
5294           vect2 = dr_chain[1];
5295
5296           /* Create interleaving stmt:
5297              low = VEC_PERM_EXPR <vect1, vect2,
5298                                   {j, nelt, *, j + 1, nelt + j + 1, *,
5299                                    j + 2, nelt + j + 2, *, ...}>  */
5300           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5301           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5302                                            vect2, perm3_mask_low);
5303           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5304
5305           vect1 = data_ref;
5306           vect2 = dr_chain[2];
5307           /* Create interleaving stmt:
5308              low = VEC_PERM_EXPR <vect1, vect2,
5309                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
5310                                    6, 7, nelt + j + 2, ...}>  */
5311           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5312           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5313                                            vect2, perm3_mask_high);
5314           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5315           (*result_chain)[j] = data_ref;
5316         }
5317     }
5318   else
5319     {
5320       /* If length is not equal to 3 then only power of 2 is supported.  */
5321       gcc_assert (pow2p_hwi (length));
5322
5323       /* The encoding has 2 interleaved stepped patterns.  */
5324       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5325       vec_perm_builder sel (nelt, 2, 3);
5326       sel.quick_grow (6);
5327       for (i = 0; i < 3; i++)
5328         {
5329           sel[i * 2] = i;
5330           sel[i * 2 + 1] = i + nelt;
5331         }
5332         vec_perm_indices indices (sel, 2, nelt);
5333         perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5334
5335         for (i = 0; i < 6; i++)
5336           sel[i] += exact_div (nelt, 2);
5337         indices.new_vector (sel, 2, nelt);
5338         perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5339
5340         for (i = 0, n = log_length; i < n; i++)
5341           {
5342             for (j = 0; j < length/2; j++)
5343               {
5344                 vect1 = dr_chain[j];
5345                 vect2 = dr_chain[j+length/2];
5346
5347                 /* Create interleaving stmt:
5348                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5349                                                         ...}>  */
5350                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5351                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5352                                                  vect2, perm_mask_high);
5353                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5354                 (*result_chain)[2*j] = high;
5355
5356                 /* Create interleaving stmt:
5357                    low = VEC_PERM_EXPR <vect1, vect2,
5358                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5359                                          ...}>  */
5360                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5361                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5362                                                  vect2, perm_mask_low);
5363                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5364                 (*result_chain)[2*j+1] = low;
5365               }
5366             memcpy (dr_chain.address (), result_chain->address (),
5367                     length * sizeof (tree));
5368           }
5369     }
5370 }
5371
5372 /* Function vect_setup_realignment
5373
5374    This function is called when vectorizing an unaligned load using
5375    the dr_explicit_realign[_optimized] scheme.
5376    This function generates the following code at the loop prolog:
5377
5378       p = initial_addr;
5379    x  msq_init = *(floor(p));   # prolog load
5380       realignment_token = call target_builtin;
5381     loop:
5382    x  msq = phi (msq_init, ---)
5383
5384    The stmts marked with x are generated only for the case of
5385    dr_explicit_realign_optimized.
5386
5387    The code above sets up a new (vector) pointer, pointing to the first
5388    location accessed by STMT, and a "floor-aligned" load using that pointer.
5389    It also generates code to compute the "realignment-token" (if the relevant
5390    target hook was defined), and creates a phi-node at the loop-header bb
5391    whose arguments are the result of the prolog-load (created by this
5392    function) and the result of a load that takes place in the loop (to be
5393    created by the caller to this function).
5394
5395    For the case of dr_explicit_realign_optimized:
5396    The caller to this function uses the phi-result (msq) to create the
5397    realignment code inside the loop, and sets up the missing phi argument,
5398    as follows:
5399     loop:
5400       msq = phi (msq_init, lsq)
5401       lsq = *(floor(p'));        # load in loop
5402       result = realign_load (msq, lsq, realignment_token);
5403
5404    For the case of dr_explicit_realign:
5405     loop:
5406       msq = *(floor(p));        # load in loop
5407       p' = p + (VS-1);
5408       lsq = *(floor(p'));       # load in loop
5409       result = realign_load (msq, lsq, realignment_token);
5410
5411    Input:
5412    STMT - (scalar) load stmt to be vectorized. This load accesses
5413           a memory location that may be unaligned.
5414    BSI - place where new code is to be inserted.
5415    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5416                               is used.
5417
5418    Output:
5419    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5420                        target hook, if defined.
5421    Return value - the result of the loop-header phi node.  */
5422
5423 tree
5424 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5425                         tree *realignment_token,
5426                         enum dr_alignment_support alignment_support_scheme,
5427                         tree init_addr,
5428                         struct loop **at_loop)
5429 {
5430   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5431   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5432   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5433   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5434   struct loop *loop = NULL;
5435   edge pe = NULL;
5436   tree scalar_dest = gimple_assign_lhs (stmt);
5437   tree vec_dest;
5438   gimple *inc;
5439   tree ptr;
5440   tree data_ref;
5441   basic_block new_bb;
5442   tree msq_init = NULL_TREE;
5443   tree new_temp;
5444   gphi *phi_stmt;
5445   tree msq = NULL_TREE;
5446   gimple_seq stmts = NULL;
5447   bool inv_p;
5448   bool compute_in_loop = false;
5449   bool nested_in_vect_loop = false;
5450   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5451   struct loop *loop_for_initial_load = NULL;
5452
5453   if (loop_vinfo)
5454     {
5455       loop = LOOP_VINFO_LOOP (loop_vinfo);
5456       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5457     }
5458
5459   gcc_assert (alignment_support_scheme == dr_explicit_realign
5460               || alignment_support_scheme == dr_explicit_realign_optimized);
5461
5462   /* We need to generate three things:
5463      1. the misalignment computation
5464      2. the extra vector load (for the optimized realignment scheme).
5465      3. the phi node for the two vectors from which the realignment is
5466       done (for the optimized realignment scheme).  */
5467
5468   /* 1. Determine where to generate the misalignment computation.
5469
5470      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5471      calculation will be generated by this function, outside the loop (in the
5472      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5473      caller, inside the loop.
5474
5475      Background: If the misalignment remains fixed throughout the iterations of
5476      the loop, then both realignment schemes are applicable, and also the
5477      misalignment computation can be done outside LOOP.  This is because we are
5478      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5479      are a multiple of VS (the Vector Size), and therefore the misalignment in
5480      different vectorized LOOP iterations is always the same.
5481      The problem arises only if the memory access is in an inner-loop nested
5482      inside LOOP, which is now being vectorized using outer-loop vectorization.
5483      This is the only case when the misalignment of the memory access may not
5484      remain fixed throughout the iterations of the inner-loop (as explained in
5485      detail in vect_supportable_dr_alignment).  In this case, not only is the
5486      optimized realignment scheme not applicable, but also the misalignment
5487      computation (and generation of the realignment token that is passed to
5488      REALIGN_LOAD) have to be done inside the loop.
5489
5490      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5491      or not, which in turn determines if the misalignment is computed inside
5492      the inner-loop, or outside LOOP.  */
5493
5494   if (init_addr != NULL_TREE || !loop_vinfo)
5495     {
5496       compute_in_loop = true;
5497       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5498     }
5499
5500
5501   /* 2. Determine where to generate the extra vector load.
5502
5503      For the optimized realignment scheme, instead of generating two vector
5504      loads in each iteration, we generate a single extra vector load in the
5505      preheader of the loop, and in each iteration reuse the result of the
5506      vector load from the previous iteration.  In case the memory access is in
5507      an inner-loop nested inside LOOP, which is now being vectorized using
5508      outer-loop vectorization, we need to determine whether this initial vector
5509      load should be generated at the preheader of the inner-loop, or can be
5510      generated at the preheader of LOOP.  If the memory access has no evolution
5511      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5512      to be generated inside LOOP (in the preheader of the inner-loop).  */
5513
5514   if (nested_in_vect_loop)
5515     {
5516       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5517       bool invariant_in_outerloop =
5518             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5519       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5520     }
5521   else
5522     loop_for_initial_load = loop;
5523   if (at_loop)
5524     *at_loop = loop_for_initial_load;
5525
5526   if (loop_for_initial_load)
5527     pe = loop_preheader_edge (loop_for_initial_load);
5528
5529   /* 3. For the case of the optimized realignment, create the first vector
5530       load at the loop preheader.  */
5531
5532   if (alignment_support_scheme == dr_explicit_realign_optimized)
5533     {
5534       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5535       gassign *new_stmt;
5536
5537       gcc_assert (!compute_in_loop);
5538       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5539       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5540                                       NULL_TREE, &init_addr, NULL, &inc,
5541                                       true, &inv_p);
5542       if (TREE_CODE (ptr) == SSA_NAME)
5543         new_temp = copy_ssa_name (ptr);
5544       else
5545         new_temp = make_ssa_name (TREE_TYPE (ptr));
5546       unsigned int align = DR_TARGET_ALIGNMENT (dr);
5547       new_stmt = gimple_build_assign
5548                    (new_temp, BIT_AND_EXPR, ptr,
5549                     build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5550       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5551       gcc_assert (!new_bb);
5552       data_ref
5553         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5554                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5555       vect_copy_ref_info (data_ref, DR_REF (dr));
5556       new_stmt = gimple_build_assign (vec_dest, data_ref);
5557       new_temp = make_ssa_name (vec_dest, new_stmt);
5558       gimple_assign_set_lhs (new_stmt, new_temp);
5559       if (pe)
5560         {
5561           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5562           gcc_assert (!new_bb);
5563         }
5564       else
5565          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5566
5567       msq_init = gimple_assign_lhs (new_stmt);
5568     }
5569
5570   /* 4. Create realignment token using a target builtin, if available.
5571       It is done either inside the containing loop, or before LOOP (as
5572       determined above).  */
5573
5574   if (targetm.vectorize.builtin_mask_for_load)
5575     {
5576       gcall *new_stmt;
5577       tree builtin_decl;
5578
5579       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
5580       if (!init_addr)
5581         {
5582           /* Generate the INIT_ADDR computation outside LOOP.  */
5583           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5584                                                             NULL_TREE);
5585           if (loop)
5586             {
5587               pe = loop_preheader_edge (loop);
5588               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5589               gcc_assert (!new_bb);
5590             }
5591           else
5592              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5593         }
5594
5595       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5596       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5597       vec_dest =
5598         vect_create_destination_var (scalar_dest,
5599                                      gimple_call_return_type (new_stmt));
5600       new_temp = make_ssa_name (vec_dest, new_stmt);
5601       gimple_call_set_lhs (new_stmt, new_temp);
5602
5603       if (compute_in_loop)
5604         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5605       else
5606         {
5607           /* Generate the misalignment computation outside LOOP.  */
5608           pe = loop_preheader_edge (loop);
5609           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5610           gcc_assert (!new_bb);
5611         }
5612
5613       *realignment_token = gimple_call_lhs (new_stmt);
5614
5615       /* The result of the CALL_EXPR to this builtin is determined from
5616          the value of the parameter and no global variables are touched
5617          which makes the builtin a "const" function.  Requiring the
5618          builtin to have the "const" attribute makes it unnecessary
5619          to call mark_call_clobbered.  */
5620       gcc_assert (TREE_READONLY (builtin_decl));
5621     }
5622
5623   if (alignment_support_scheme == dr_explicit_realign)
5624     return msq;
5625
5626   gcc_assert (!compute_in_loop);
5627   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5628
5629
5630   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5631
5632   pe = loop_preheader_edge (containing_loop);
5633   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5634   msq = make_ssa_name (vec_dest);
5635   phi_stmt = create_phi_node (msq, containing_loop->header);
5636   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5637
5638   return msq;
5639 }
5640
5641
5642 /* Function vect_grouped_load_supported.
5643
5644    COUNT is the size of the load group (the number of statements plus the
5645    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5646    only one statement, with a gap of COUNT - 1.
5647
5648    Returns true if a suitable permute exists.  */
5649
5650 bool
5651 vect_grouped_load_supported (tree vectype, bool single_element_p,
5652                              unsigned HOST_WIDE_INT count)
5653 {
5654   machine_mode mode = TYPE_MODE (vectype);
5655
5656   /* If this is single-element interleaving with an element distance
5657      that leaves unused vector loads around punt - we at least create
5658      very sub-optimal code in that case (and blow up memory,
5659      see PR65518).  */
5660   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5661     {
5662       if (dump_enabled_p ())
5663         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5664                          "single-element interleaving not supported "
5665                          "for not adjacent vector loads\n");
5666       return false;
5667     }
5668
5669   /* vect_permute_load_chain requires the group size to be equal to 3 or
5670      be a power of two.  */
5671   if (count != 3 && exact_log2 (count) == -1)
5672     {
5673       if (dump_enabled_p ())
5674         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5675                          "the size of the group of accesses"
5676                          " is not a power of 2 or not equal to 3\n");
5677       return false;
5678     }
5679
5680   /* Check that the permutation is supported.  */
5681   if (VECTOR_MODE_P (mode))
5682     {
5683       unsigned int i, j;
5684       if (count == 3)
5685         {
5686           unsigned int nelt;
5687           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5688             {
5689               if (dump_enabled_p ())
5690                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5691                                  "cannot handle groups of 3 loads for"
5692                                  " variable-length vectors\n");
5693               return false;
5694             }
5695
5696           vec_perm_builder sel (nelt, nelt, 1);
5697           sel.quick_grow (nelt);
5698           vec_perm_indices indices;
5699           unsigned int k;
5700           for (k = 0; k < 3; k++)
5701             {
5702               for (i = 0; i < nelt; i++)
5703                 if (3 * i + k < 2 * nelt)
5704                   sel[i] = 3 * i + k;
5705                 else
5706                   sel[i] = 0;
5707               indices.new_vector (sel, 2, nelt);
5708               if (!can_vec_perm_const_p (mode, indices))
5709                 {
5710                   if (dump_enabled_p ())
5711                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5712                                      "shuffle of 3 loads is not supported by"
5713                                      " target\n");
5714                   return false;
5715                 }
5716               for (i = 0, j = 0; i < nelt; i++)
5717                 if (3 * i + k < 2 * nelt)
5718                   sel[i] = i;
5719                 else
5720                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5721               indices.new_vector (sel, 2, nelt);
5722               if (!can_vec_perm_const_p (mode, indices))
5723                 {
5724                   if (dump_enabled_p ())
5725                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5726                                      "shuffle of 3 loads is not supported by"
5727                                      " target\n");
5728                   return false;
5729                 }
5730             }
5731           return true;
5732         }
5733       else
5734         {
5735           /* If length is not equal to 3 then only power of 2 is supported.  */
5736           gcc_assert (pow2p_hwi (count));
5737           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5738
5739           /* The encoding has a single stepped pattern.  */
5740           vec_perm_builder sel (nelt, 1, 3);
5741           sel.quick_grow (3);
5742           for (i = 0; i < 3; i++)
5743             sel[i] = i * 2;
5744           vec_perm_indices indices (sel, 2, nelt);
5745           if (can_vec_perm_const_p (mode, indices))
5746             {
5747               for (i = 0; i < 3; i++)
5748                 sel[i] = i * 2 + 1;
5749               indices.new_vector (sel, 2, nelt);
5750               if (can_vec_perm_const_p (mode, indices))
5751                 return true;
5752             }
5753         }
5754     }
5755
5756   if (dump_enabled_p ())
5757     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5758                      "extract even/odd not supported by target\n");
5759   return false;
5760 }
5761
5762 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5763    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5764
5765 bool
5766 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5767                            bool masked_p)
5768 {
5769   if (masked_p)
5770     return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5771                                          vec_mask_load_lanes_optab,
5772                                          vectype, count);
5773   else
5774     return vect_lanes_optab_supported_p ("vec_load_lanes",
5775                                          vec_load_lanes_optab,
5776                                          vectype, count);
5777 }
5778
5779 /* Function vect_permute_load_chain.
5780
5781    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5782    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5783    the input data correctly.  Return the final references for loads in
5784    RESULT_CHAIN.
5785
5786    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5787    The input is 4 vectors each containing 8 elements. We assign a number to each
5788    element, the input sequence is:
5789
5790    1st vec:   0  1  2  3  4  5  6  7
5791    2nd vec:   8  9 10 11 12 13 14 15
5792    3rd vec:  16 17 18 19 20 21 22 23
5793    4th vec:  24 25 26 27 28 29 30 31
5794
5795    The output sequence should be:
5796
5797    1st vec:  0 4  8 12 16 20 24 28
5798    2nd vec:  1 5  9 13 17 21 25 29
5799    3rd vec:  2 6 10 14 18 22 26 30
5800    4th vec:  3 7 11 15 19 23 27 31
5801
5802    i.e., the first output vector should contain the first elements of each
5803    interleaving group, etc.
5804
5805    We use extract_even/odd instructions to create such output.  The input of
5806    each extract_even/odd operation is two vectors
5807    1st vec    2nd vec
5808    0 1 2 3    4 5 6 7
5809
5810    and the output is the vector of extracted even/odd elements.  The output of
5811    extract_even will be:   0 2 4 6
5812    and of extract_odd:     1 3 5 7
5813
5814
5815    The permutation is done in log LENGTH stages.  In each stage extract_even
5816    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5817    their order.  In our example,
5818
5819    E1: extract_even (1st vec, 2nd vec)
5820    E2: extract_odd (1st vec, 2nd vec)
5821    E3: extract_even (3rd vec, 4th vec)
5822    E4: extract_odd (3rd vec, 4th vec)
5823
5824    The output for the first stage will be:
5825
5826    E1:  0  2  4  6  8 10 12 14
5827    E2:  1  3  5  7  9 11 13 15
5828    E3: 16 18 20 22 24 26 28 30
5829    E4: 17 19 21 23 25 27 29 31
5830
5831    In order to proceed and create the correct sequence for the next stage (or
5832    for the correct output, if the second stage is the last one, as in our
5833    example), we first put the output of extract_even operation and then the
5834    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5835    The input for the second stage is:
5836
5837    1st vec (E1):  0  2  4  6  8 10 12 14
5838    2nd vec (E3): 16 18 20 22 24 26 28 30
5839    3rd vec (E2):  1  3  5  7  9 11 13 15
5840    4th vec (E4): 17 19 21 23 25 27 29 31
5841
5842    The output of the second stage:
5843
5844    E1: 0 4  8 12 16 20 24 28
5845    E2: 2 6 10 14 18 22 26 30
5846    E3: 1 5  9 13 17 21 25 29
5847    E4: 3 7 11 15 19 23 27 31
5848
5849    And RESULT_CHAIN after reordering:
5850
5851    1st vec (E1):  0 4  8 12 16 20 24 28
5852    2nd vec (E3):  1 5  9 13 17 21 25 29
5853    3rd vec (E2):  2 6 10 14 18 22 26 30
5854    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5855
5856 static void
5857 vect_permute_load_chain (vec<tree> dr_chain,
5858                          unsigned int length,
5859                          gimple *stmt,
5860                          gimple_stmt_iterator *gsi,
5861                          vec<tree> *result_chain)
5862 {
5863   tree data_ref, first_vect, second_vect;
5864   tree perm_mask_even, perm_mask_odd;
5865   tree perm3_mask_low, perm3_mask_high;
5866   gimple *perm_stmt;
5867   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5868   unsigned int i, j, log_length = exact_log2 (length);
5869
5870   result_chain->quick_grow (length);
5871   memcpy (result_chain->address (), dr_chain.address (),
5872           length * sizeof (tree));
5873
5874   if (length == 3)
5875     {
5876       /* vect_grouped_load_supported ensures that this is constant.  */
5877       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5878       unsigned int k;
5879
5880       vec_perm_builder sel (nelt, nelt, 1);
5881       sel.quick_grow (nelt);
5882       vec_perm_indices indices;
5883       for (k = 0; k < 3; k++)
5884         {
5885           for (i = 0; i < nelt; i++)
5886             if (3 * i + k < 2 * nelt)
5887               sel[i] = 3 * i + k;
5888             else
5889               sel[i] = 0;
5890           indices.new_vector (sel, 2, nelt);
5891           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5892
5893           for (i = 0, j = 0; i < nelt; i++)
5894             if (3 * i + k < 2 * nelt)
5895               sel[i] = i;
5896             else
5897               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5898           indices.new_vector (sel, 2, nelt);
5899           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5900
5901           first_vect = dr_chain[0];
5902           second_vect = dr_chain[1];
5903
5904           /* Create interleaving stmt (low part of):
5905              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5906                                                              ...}>  */
5907           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5908           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5909                                            second_vect, perm3_mask_low);
5910           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5911
5912           /* Create interleaving stmt (high part of):
5913              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5914                                                               ...}>  */
5915           first_vect = data_ref;
5916           second_vect = dr_chain[2];
5917           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5918           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5919                                            second_vect, perm3_mask_high);
5920           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5921           (*result_chain)[k] = data_ref;
5922         }
5923     }
5924   else
5925     {
5926       /* If length is not equal to 3 then only power of 2 is supported.  */
5927       gcc_assert (pow2p_hwi (length));
5928
5929       /* The encoding has a single stepped pattern.  */
5930       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5931       vec_perm_builder sel (nelt, 1, 3);
5932       sel.quick_grow (3);
5933       for (i = 0; i < 3; ++i)
5934         sel[i] = i * 2;
5935       vec_perm_indices indices (sel, 2, nelt);
5936       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5937
5938       for (i = 0; i < 3; ++i)
5939         sel[i] = i * 2 + 1;
5940       indices.new_vector (sel, 2, nelt);
5941       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5942
5943       for (i = 0; i < log_length; i++)
5944         {
5945           for (j = 0; j < length; j += 2)
5946             {
5947               first_vect = dr_chain[j];
5948               second_vect = dr_chain[j+1];
5949
5950               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5951               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5952               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5953                                                first_vect, second_vect,
5954                                                perm_mask_even);
5955               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5956               (*result_chain)[j/2] = data_ref;
5957
5958               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5959               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5960               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5961                                                first_vect, second_vect,
5962                                                perm_mask_odd);
5963               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5964               (*result_chain)[j/2+length/2] = data_ref;
5965             }
5966           memcpy (dr_chain.address (), result_chain->address (),
5967                   length * sizeof (tree));
5968         }
5969     }
5970 }
5971
5972 /* Function vect_shift_permute_load_chain.
5973
5974    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5975    sequence of stmts to reorder the input data accordingly.
5976    Return the final references for loads in RESULT_CHAIN.
5977    Return true if successed, false otherwise.
5978
5979    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5980    The input is 3 vectors each containing 8 elements.  We assign a
5981    number to each element, the input sequence is:
5982
5983    1st vec:   0  1  2  3  4  5  6  7
5984    2nd vec:   8  9 10 11 12 13 14 15
5985    3rd vec:  16 17 18 19 20 21 22 23
5986
5987    The output sequence should be:
5988
5989    1st vec:  0 3 6  9 12 15 18 21
5990    2nd vec:  1 4 7 10 13 16 19 22
5991    3rd vec:  2 5 8 11 14 17 20 23
5992
5993    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5994
5995    First we shuffle all 3 vectors to get correct elements order:
5996
5997    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5998    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5999    3rd vec:  (16 19 22) (17 20 23) (18 21)
6000
6001    Next we unite and shift vector 3 times:
6002
6003    1st step:
6004      shift right by 6 the concatenation of:
6005      "1st vec" and  "2nd vec"
6006        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6007      "2nd vec" and  "3rd vec"
6008        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6009      "3rd vec" and  "1st vec"
6010        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6011                              | New vectors                   |
6012
6013      So that now new vectors are:
6014
6015      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6016      2nd vec:  (10 13) (16 19 22) (17 20 23)
6017      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6018
6019    2nd step:
6020      shift right by 5 the concatenation of:
6021      "1st vec" and  "3rd vec"
6022        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6023      "2nd vec" and  "1st vec"
6024        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6025      "3rd vec" and  "2nd vec"
6026        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6027                           | New vectors                   |
6028
6029      So that now new vectors are:
6030
6031      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6032      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6033      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6034
6035    3rd step:
6036      shift right by 5 the concatenation of:
6037      "1st vec" and  "1st vec"
6038        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6039      shift right by 3 the concatenation of:
6040      "2nd vec" and  "2nd vec"
6041                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6042                           | New vectors                   |
6043
6044      So that now all vectors are READY:
6045      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6046      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6047      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6048
6049    This algorithm is faster than one in vect_permute_load_chain if:
6050      1.  "shift of a concatination" is faster than general permutation.
6051          This is usually so.
6052      2.  The TARGET machine can't execute vector instructions in parallel.
6053          This is because each step of the algorithm depends on previous.
6054          The algorithm in vect_permute_load_chain is much more parallel.
6055
6056    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6057 */
6058
6059 static bool
6060 vect_shift_permute_load_chain (vec<tree> dr_chain,
6061                                unsigned int length,
6062                                gimple *stmt,
6063                                gimple_stmt_iterator *gsi,
6064                                vec<tree> *result_chain)
6065 {
6066   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6067   tree perm2_mask1, perm2_mask2, perm3_mask;
6068   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6069   gimple *perm_stmt;
6070
6071   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6072   unsigned int i;
6073   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6074   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6075
6076   unsigned HOST_WIDE_INT nelt, vf;
6077   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6078       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6079     /* Not supported for variable-length vectors.  */
6080     return false;
6081
6082   vec_perm_builder sel (nelt, nelt, 1);
6083   sel.quick_grow (nelt);
6084
6085   result_chain->quick_grow (length);
6086   memcpy (result_chain->address (), dr_chain.address (),
6087           length * sizeof (tree));
6088
6089   if (pow2p_hwi (length) && vf > 4)
6090     {
6091       unsigned int j, log_length = exact_log2 (length);
6092       for (i = 0; i < nelt / 2; ++i)
6093         sel[i] = i * 2;
6094       for (i = 0; i < nelt / 2; ++i)
6095         sel[nelt / 2 + i] = i * 2 + 1;
6096       vec_perm_indices indices (sel, 2, nelt);
6097       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6098         {
6099           if (dump_enabled_p ())
6100             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6101                              "shuffle of 2 fields structure is not \
6102                               supported by target\n");
6103           return false;
6104         }
6105       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6106
6107       for (i = 0; i < nelt / 2; ++i)
6108         sel[i] = i * 2 + 1;
6109       for (i = 0; i < nelt / 2; ++i)
6110         sel[nelt / 2 + i] = i * 2;
6111       indices.new_vector (sel, 2, nelt);
6112       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6113         {
6114           if (dump_enabled_p ())
6115             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6116                              "shuffle of 2 fields structure is not \
6117                               supported by target\n");
6118           return false;
6119         }
6120       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6121
6122       /* Generating permutation constant to shift all elements.
6123          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6124       for (i = 0; i < nelt; i++)
6125         sel[i] = nelt / 2 + i;
6126       indices.new_vector (sel, 2, nelt);
6127       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6128         {
6129           if (dump_enabled_p ())
6130             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6131                              "shift permutation is not supported by target\n");
6132           return false;
6133         }
6134       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6135
6136       /* Generating permutation constant to select vector from 2.
6137          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6138       for (i = 0; i < nelt / 2; i++)
6139         sel[i] = i;
6140       for (i = nelt / 2; i < nelt; i++)
6141         sel[i] = nelt + i;
6142       indices.new_vector (sel, 2, nelt);
6143       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6144         {
6145           if (dump_enabled_p ())
6146             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6147                              "select is not supported by target\n");
6148           return false;
6149         }
6150       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6151
6152       for (i = 0; i < log_length; i++)
6153         {
6154           for (j = 0; j < length; j += 2)
6155             {
6156               first_vect = dr_chain[j];
6157               second_vect = dr_chain[j + 1];
6158
6159               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6160               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6161                                                first_vect, first_vect,
6162                                                perm2_mask1);
6163               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6164               vect[0] = data_ref;
6165
6166               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6167               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6168                                                second_vect, second_vect,
6169                                                perm2_mask2);
6170               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6171               vect[1] = data_ref;
6172
6173               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6174               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6175                                                vect[0], vect[1], shift1_mask);
6176               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6177               (*result_chain)[j/2 + length/2] = data_ref;
6178
6179               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6180               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6181                                                vect[0], vect[1], select_mask);
6182               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6183               (*result_chain)[j/2] = data_ref;
6184             }
6185           memcpy (dr_chain.address (), result_chain->address (),
6186                   length * sizeof (tree));
6187         }
6188       return true;
6189     }
6190   if (length == 3 && vf > 2)
6191     {
6192       unsigned int k = 0, l = 0;
6193
6194       /* Generating permutation constant to get all elements in rigth order.
6195          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6196       for (i = 0; i < nelt; i++)
6197         {
6198           if (3 * k + (l % 3) >= nelt)
6199             {
6200               k = 0;
6201               l += (3 - (nelt % 3));
6202             }
6203           sel[i] = 3 * k + (l % 3);
6204           k++;
6205         }
6206       vec_perm_indices indices (sel, 2, nelt);
6207       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6208         {
6209           if (dump_enabled_p ())
6210             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6211                              "shuffle of 3 fields structure is not \
6212                               supported by target\n");
6213           return false;
6214         }
6215       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6216
6217       /* Generating permutation constant to shift all elements.
6218          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6219       for (i = 0; i < nelt; i++)
6220         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6221       indices.new_vector (sel, 2, nelt);
6222       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6223         {
6224           if (dump_enabled_p ())
6225             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6226                              "shift permutation is not supported by target\n");
6227           return false;
6228         }
6229       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6230
6231       /* Generating permutation constant to shift all elements.
6232          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6233       for (i = 0; i < nelt; i++)
6234         sel[i] = 2 * (nelt / 3) + 1 + i;
6235       indices.new_vector (sel, 2, nelt);
6236       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6237         {
6238           if (dump_enabled_p ())
6239             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6240                              "shift permutation is not supported by target\n");
6241           return false;
6242         }
6243       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6244
6245       /* Generating permutation constant to shift all elements.
6246          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6247       for (i = 0; i < nelt; i++)
6248         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6249       indices.new_vector (sel, 2, nelt);
6250       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6251         {
6252           if (dump_enabled_p ())
6253             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6254                              "shift permutation is not supported by target\n");
6255           return false;
6256         }
6257       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6258
6259       /* Generating permutation constant to shift all elements.
6260          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6261       for (i = 0; i < nelt; i++)
6262         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6263       indices.new_vector (sel, 2, nelt);
6264       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6265         {
6266           if (dump_enabled_p ())
6267             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6268                              "shift permutation is not supported by target\n");
6269           return false;
6270         }
6271       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6272
6273       for (k = 0; k < 3; k++)
6274         {
6275           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6276           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6277                                            dr_chain[k], dr_chain[k],
6278                                            perm3_mask);
6279           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6280           vect[k] = data_ref;
6281         }
6282
6283       for (k = 0; k < 3; k++)
6284         {
6285           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6286           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6287                                            vect[k % 3], vect[(k + 1) % 3],
6288                                            shift1_mask);
6289           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6290           vect_shift[k] = data_ref;
6291         }
6292
6293       for (k = 0; k < 3; k++)
6294         {
6295           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6296           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6297                                            vect_shift[(4 - k) % 3],
6298                                            vect_shift[(3 - k) % 3],
6299                                            shift2_mask);
6300           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6301           vect[k] = data_ref;
6302         }
6303
6304       (*result_chain)[3 - (nelt % 3)] = vect[2];
6305
6306       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6307       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6308                                        vect[0], shift3_mask);
6309       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6310       (*result_chain)[nelt % 3] = data_ref;
6311
6312       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6313       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6314                                        vect[1], shift4_mask);
6315       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6316       (*result_chain)[0] = data_ref;
6317       return true;
6318     }
6319   return false;
6320 }
6321
6322 /* Function vect_transform_grouped_load.
6323
6324    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6325    to perform their permutation and ascribe the result vectorized statements to
6326    the scalar statements.
6327 */
6328
6329 void
6330 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6331                              gimple_stmt_iterator *gsi)
6332 {
6333   machine_mode mode;
6334   vec<tree> result_chain = vNULL;
6335
6336   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6337      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6338      vectors, that are ready for vector computation.  */
6339   result_chain.create (size);
6340
6341   /* If reassociation width for vector type is 2 or greater target machine can
6342      execute 2 or more vector instructions in parallel.  Otherwise try to
6343      get chain for loads group using vect_shift_permute_load_chain.  */
6344   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6345   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6346       || pow2p_hwi (size)
6347       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6348                                          gsi, &result_chain))
6349     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6350   vect_record_grouped_load_vectors (stmt, result_chain);
6351   result_chain.release ();
6352 }
6353
6354 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6355    generated as part of the vectorization of STMT.  Assign the statement
6356    for each vector to the associated scalar statement.  */
6357
6358 void
6359 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6360 {
6361   gimple *first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6362   gimple *next_stmt, *new_stmt;
6363   unsigned int i, gap_count;
6364   tree tmp_data_ref;
6365
6366   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6367      Since we scan the chain starting from it's first node, their order
6368      corresponds the order of data-refs in RESULT_CHAIN.  */
6369   next_stmt = first_stmt;
6370   gap_count = 1;
6371   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6372     {
6373       if (!next_stmt)
6374         break;
6375
6376       /* Skip the gaps.  Loads created for the gaps will be removed by dead
6377        code elimination pass later.  No need to check for the first stmt in
6378        the group, since it always exists.
6379        DR_GROUP_GAP is the number of steps in elements from the previous
6380        access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
6381        correspond to the gaps.  */
6382       if (next_stmt != first_stmt
6383           && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
6384       {
6385         gap_count++;
6386         continue;
6387       }
6388
6389       while (next_stmt)
6390         {
6391           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6392           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6393              copies, and we put the new vector statement in the first available
6394              RELATED_STMT.  */
6395           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6396             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6397           else
6398             {
6399               if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6400                 {
6401                   gimple *prev_stmt =
6402                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6403                   gimple *rel_stmt =
6404                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6405                   while (rel_stmt)
6406                     {
6407                       prev_stmt = rel_stmt;
6408                       rel_stmt =
6409                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6410                     }
6411
6412                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6413                     new_stmt;
6414                 }
6415             }
6416
6417           next_stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6418           gap_count = 1;
6419           /* If NEXT_STMT accesses the same DR as the previous statement,
6420              put the same TMP_DATA_REF as its vectorized statement; otherwise
6421              get the next data-ref from RESULT_CHAIN.  */
6422           if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6423             break;
6424         }
6425     }
6426 }
6427
6428 /* Function vect_force_dr_alignment_p.
6429
6430    Returns whether the alignment of a DECL can be forced to be aligned
6431    on ALIGNMENT bit boundary.  */
6432
6433 bool
6434 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6435 {
6436   if (!VAR_P (decl))
6437     return false;
6438
6439   if (decl_in_symtab_p (decl)
6440       && !symtab_node::get (decl)->can_increase_alignment_p ())
6441     return false;
6442
6443   if (TREE_STATIC (decl))
6444     return (alignment <= MAX_OFILE_ALIGNMENT);
6445   else
6446     return (alignment <= MAX_STACK_ALIGNMENT);
6447 }
6448
6449
6450 /* Return whether the data reference DR is supported with respect to its
6451    alignment.
6452    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6453    it is aligned, i.e., check if it is possible to vectorize it with different
6454    alignment.  */
6455
6456 enum dr_alignment_support
6457 vect_supportable_dr_alignment (struct data_reference *dr,
6458                                bool check_aligned_accesses)
6459 {
6460   gimple *stmt = vect_dr_stmt (dr);
6461   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6462   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6463   machine_mode mode = TYPE_MODE (vectype);
6464   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6465   struct loop *vect_loop = NULL;
6466   bool nested_in_vect_loop = false;
6467
6468   if (aligned_access_p (dr) && !check_aligned_accesses)
6469     return dr_aligned;
6470
6471   /* For now assume all conditional loads/stores support unaligned
6472      access without any special code.  */
6473   if (is_gimple_call (stmt)
6474       && gimple_call_internal_p (stmt)
6475       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6476           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6477     return dr_unaligned_supported;
6478
6479   if (loop_vinfo)
6480     {
6481       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6482       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6483     }
6484
6485   /* Possibly unaligned access.  */
6486
6487   /* We can choose between using the implicit realignment scheme (generating
6488      a misaligned_move stmt) and the explicit realignment scheme (generating
6489      aligned loads with a REALIGN_LOAD).  There are two variants to the
6490      explicit realignment scheme: optimized, and unoptimized.
6491      We can optimize the realignment only if the step between consecutive
6492      vector loads is equal to the vector size.  Since the vector memory
6493      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6494      is guaranteed that the misalignment amount remains the same throughout the
6495      execution of the vectorized loop.  Therefore, we can create the
6496      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6497      at the loop preheader.
6498
6499      However, in the case of outer-loop vectorization, when vectorizing a
6500      memory access in the inner-loop nested within the LOOP that is now being
6501      vectorized, while it is guaranteed that the misalignment of the
6502      vectorized memory access will remain the same in different outer-loop
6503      iterations, it is *not* guaranteed that is will remain the same throughout
6504      the execution of the inner-loop.  This is because the inner-loop advances
6505      with the original scalar step (and not in steps of VS).  If the inner-loop
6506      step happens to be a multiple of VS, then the misalignment remains fixed
6507      and we can use the optimized realignment scheme.  For example:
6508
6509       for (i=0; i<N; i++)
6510         for (j=0; j<M; j++)
6511           s += a[i+j];
6512
6513      When vectorizing the i-loop in the above example, the step between
6514      consecutive vector loads is 1, and so the misalignment does not remain
6515      fixed across the execution of the inner-loop, and the realignment cannot
6516      be optimized (as illustrated in the following pseudo vectorized loop):
6517
6518       for (i=0; i<N; i+=4)
6519         for (j=0; j<M; j++){
6520           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6521                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6522                          // (assuming that we start from an aligned address).
6523           }
6524
6525      We therefore have to use the unoptimized realignment scheme:
6526
6527       for (i=0; i<N; i+=4)
6528           for (j=k; j<M; j+=4)
6529           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6530                            // that the misalignment of the initial address is
6531                            // 0).
6532
6533      The loop can then be vectorized as follows:
6534
6535       for (k=0; k<4; k++){
6536         rt = get_realignment_token (&vp[k]);
6537         for (i=0; i<N; i+=4){
6538           v1 = vp[i+k];
6539           for (j=k; j<M; j+=4){
6540             v2 = vp[i+j+VS-1];
6541             va = REALIGN_LOAD <v1,v2,rt>;
6542             vs += va;
6543             v1 = v2;
6544           }
6545         }
6546     } */
6547
6548   if (DR_IS_READ (dr))
6549     {
6550       bool is_packed = false;
6551       tree type = (TREE_TYPE (DR_REF (dr)));
6552
6553       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6554           && (!targetm.vectorize.builtin_mask_for_load
6555               || targetm.vectorize.builtin_mask_for_load ()))
6556         {
6557           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6558
6559           /* If we are doing SLP then the accesses need not have the
6560              same alignment, instead it depends on the SLP group size.  */
6561           if (loop_vinfo
6562               && STMT_SLP_TYPE (stmt_info)
6563               && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6564                               * DR_GROUP_SIZE (vinfo_for_stmt
6565                                             (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6566                               TYPE_VECTOR_SUBPARTS (vectype)))
6567             ;
6568           else if (!loop_vinfo
6569                    || (nested_in_vect_loop
6570                        && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6571                                     GET_MODE_SIZE (TYPE_MODE (vectype)))))
6572             return dr_explicit_realign;
6573           else
6574             return dr_explicit_realign_optimized;
6575         }
6576       if (!known_alignment_for_access_p (dr))
6577         is_packed = not_size_aligned (DR_REF (dr));
6578
6579       if (targetm.vectorize.support_vector_misalignment
6580             (mode, type, DR_MISALIGNMENT (dr), is_packed))
6581         /* Can't software pipeline the loads, but can at least do them.  */
6582         return dr_unaligned_supported;
6583     }
6584   else
6585     {
6586       bool is_packed = false;
6587       tree type = (TREE_TYPE (DR_REF (dr)));
6588
6589       if (!known_alignment_for_access_p (dr))
6590         is_packed = not_size_aligned (DR_REF (dr));
6591
6592      if (targetm.vectorize.support_vector_misalignment
6593            (mode, type, DR_MISALIGNMENT (dr), is_packed))
6594        return dr_unaligned_supported;
6595     }
6596
6597   /* Unsupported.  */
6598   return dr_unaligned_unsupported;
6599 }