1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
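/* As an illustration: a group of COUNT == 2 V4SI accesses needs a
   2-vector "array" mode (OI on AArch64, for instance) and a
   vec_load_lanes / vec_store_lanes pattern for that mode; AArch64
   maps these to its LD2/ST2 instructions.  If no such pattern
   exists, the caller falls back to separate loads/stores plus
   permutes.  */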
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
102 /* Return the smallest scalar part of STMT.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation - we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
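/* For example, for the widening statement

     int_x = (int) short_y;

   the lhs type is int (4 bytes) while the rhs type is short
   (2 bytes), so the short type is returned; the narrower type is
   what determines how many scalar elements fit into one vector and
   hence the vectorization factor.  */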
119 tree
120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 if (is_gimple_assign (stmt)
134 && (gimple_assign_cast_p (stmt)
135 || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
148 *lhs_size_unit = lhs;
149 *rhs_size_unit = rhs;
150 return scalar_type;
154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
155 tested at run-time. Return TRUE if DDR was successfully inserted.
156 Return false if versioning is not supported. */
158 static bool
159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
161 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
163 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
164 return false;
166 if (!runtime_alias_check_p (ddr, loop,
167 optimize_loop_nest_for_speed_p (loop)))
168 return false;
170 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
171 return true;
174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
176 static void
177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
179 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
180 for (unsigned int i = 0; i < checks.length(); ++i)
181 if (checks[i] == value)
182 return;
184 if (dump_enabled_p ())
186 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
187 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
188 dump_printf (MSG_NOTE, " is nonzero\n");
190 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
193 /* Return true if we know that the order of vectorized STMT_A and
194 vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
195 At least one of the statements is a write. */
197 static bool
198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
200 stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
201 stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
203 /* Single statements are always kept in their original order. */
204 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
205 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
206 return true;
208 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
209 group are emitted at the position of the first scalar load and all
210 stores in a group are emitted at the position of the last scalar store.
211 Thus writes will happen no earlier than their current position
212 (but could happen later) while reads will happen no later than their
213 current position (but could happen earlier). Reordering is therefore
214 only possible if the first access is a write. */
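/* For example, if STMT_A is a store from the group
   { a[i] = ...; a[i+1] = ...; } and STMT_B is a scalar load of a[i]
   placed between the two stores, the vectorized store is emitted at
   the position of a[i+1] = ..., i.e. after the load, so the original
   order is not preserved and false is returned below.  */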
215 if (is_pattern_stmt_p (stmtinfo_a))
216 stmt_a = STMT_VINFO_RELATED_STMT (stmtinfo_a);
217 if (is_pattern_stmt_p (stmtinfo_b))
218 stmt_b = STMT_VINFO_RELATED_STMT (stmtinfo_b);
219 gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
220 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
223 /* A subroutine of vect_analyze_data_ref_dependence. Handle
224 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
225 distances. These distances are conservatively correct but they don't
226 reflect a guaranteed dependence.
228 Return true if this function does all the work necessary to avoid
229 an alias or false if the caller should use the dependence distances
230 to limit the vectorization factor in the usual way. LOOP_DEPTH is
231 the depth of the loop described by LOOP_VINFO and the other arguments
232 are as for vect_analyze_data_ref_dependence. */
234 static bool
235 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
236 loop_vec_info loop_vinfo,
237 int loop_depth, unsigned int *max_vf)
239 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
240 lambda_vector dist_v;
241 unsigned int i;
242 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
244 int dist = dist_v[loop_depth];
245 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
247 /* If the user asserted safelen >= DIST consecutive iterations
248 can be executed concurrently, assume independence.
250 ??? An alternative would be to add the alias check even
251 in this case, and vectorize the fallback loop with the
252 maximum VF set to safelen. However, if the user has
253 explicitly given a length, it's less likely that that
254 would be a win. */
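/* For example, a loop annotated with #pragma omp simd safelen(8)
   has loop->safelen == 8, so a conservative distance of, say, 4 is
   ignored here and the vectorization factor is merely capped at 8.  */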
255 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
257 if ((unsigned int) loop->safelen < *max_vf)
258 *max_vf = loop->safelen;
259 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
260 continue;
263 /* For dependence distances of 2 or more, we have the option
264 of limiting VF or checking for an alias at runtime.
265 Prefer to check at runtime if we can, to avoid limiting
266 the VF unnecessarily when the bases are in fact independent.
268 Note that the alias checks will be removed if the VF ends up
269 being small enough. */
270 return (!STMT_VINFO_GATHER_SCATTER_P
271 (vinfo_for_stmt (DR_STMT (DDR_A (ddr))))
272 && !STMT_VINFO_GATHER_SCATTER_P
273 (vinfo_for_stmt (DR_STMT (DDR_B (ddr))))
274 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
277 return true;
281 /* Function vect_analyze_data_ref_dependence.
283 Return TRUE if there (might) exist a dependence between a memory-reference
284 DRA and a memory-reference DRB. Return FALSE when the dependence can
285 instead be handled by a run-time alias check (versioning for alias).
286 Adjust *MAX_VF according to the data dependence. */
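/* As a worked example, in

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   the read a[i] and the write a[i + 3] have dependence distance 3,
   so at most three consecutive scalar iterations can be combined
   into one vector iteration and *MAX_VF is limited to 3 below,
   whereas a distance that is already >= the vectorization factor
   needs no such adjustment.  */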
288 static bool
289 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
290 loop_vec_info loop_vinfo,
291 unsigned int *max_vf)
293 unsigned int i;
294 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
295 struct data_reference *dra = DDR_A (ddr);
296 struct data_reference *drb = DDR_B (ddr);
297 stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
298 stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
299 lambda_vector dist_v;
300 unsigned int loop_depth;
302 /* In loop analysis all data references should be vectorizable. */
303 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
304 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
305 gcc_unreachable ();
307 /* Independent data accesses. */
308 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
309 return false;
311 if (dra == drb
312 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
313 return false;
315 /* We do not have to consider dependences between accesses that belong
316 to the same group, unless the stride could be smaller than the
317 group size. */
318 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
319 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
320 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
321 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
322 return false;
324 /* Even if we have an anti-dependence then, as the vectorized loop covers at
325 least two scalar iterations, there is always also a true dependence.
326 As the vectorizer does not re-order loads and stores we can ignore
327 the anti-dependence if TBAA can disambiguate both DRs similar to the
328 case with known negative distance anti-dependences (positive
329 distance anti-dependences would violate TBAA constraints). */
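/* For instance, an anti-dependence between a load of an int object
   and a store to a float object can be ignored here, because under
   TBAA the two accesses cannot overlap.  */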
330 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
331 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
332 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
333 get_alias_set (DR_REF (drb))))
334 return false;
336 /* Unknown data dependence. */
337 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
339 /* If user asserted safelen consecutive iterations can be
340 executed concurrently, assume independence. */
341 if (loop->safelen >= 2)
343 if ((unsigned int) loop->safelen < *max_vf)
344 *max_vf = loop->safelen;
345 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
346 return false;
349 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
350 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
352 if (dump_enabled_p ())
354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
355 "versioning for alias not supported for: "
356 "can't determine dependence between ");
357 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
358 DR_REF (dra));
359 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
360 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
361 DR_REF (drb));
362 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
364 return true;
367 if (dump_enabled_p ())
369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
370 "versioning for alias required: "
371 "can't determine dependence between ");
372 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
373 DR_REF (dra));
374 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
375 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
376 DR_REF (drb));
377 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
380 /* Add to list of ddrs that need to be tested at run-time. */
381 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
384 /* Known data dependence. */
385 if (DDR_NUM_DIST_VECTS (ddr) == 0)
387 /* If user asserted safelen consecutive iterations can be
388 executed concurrently, assume independence. */
389 if (loop->safelen >= 2)
391 if ((unsigned int) loop->safelen < *max_vf)
392 *max_vf = loop->safelen;
393 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
394 return false;
397 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
398 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
400 if (dump_enabled_p ())
402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
403 "versioning for alias not supported for: "
404 "bad dist vector for ");
405 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
406 DR_REF (dra));
407 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
408 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
409 DR_REF (drb));
410 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
412 return true;
415 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "versioning for alias required: "
419 "bad dist vector for ");
420 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
421 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
422 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
423 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
425 /* Add to list of ddrs that need to be tested at run-time. */
426 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
429 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
431 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
432 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
433 loop_depth, max_vf))
434 return false;
436 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
438 int dist = dist_v[loop_depth];
440 if (dump_enabled_p ())
441 dump_printf_loc (MSG_NOTE, vect_location,
442 "dependence distance = %d.\n", dist);
444 if (dist == 0)
446 if (dump_enabled_p ())
448 dump_printf_loc (MSG_NOTE, vect_location,
449 "dependence distance == 0 between ");
450 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
451 dump_printf (MSG_NOTE, " and ");
452 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
453 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
456 /* When we perform grouped accesses and perform implicit CSE
457 by detecting equal accesses and doing disambiguation with
458 runtime alias tests like for
459 .. = a[i];
460 .. = a[i+1];
461 a[i] = ..;
462 a[i+1] = ..;
463 *p = ..;
464 .. = a[i];
465 .. = a[i+1];
466 where we will end up loading { a[i], a[i+1] } once, make
467 sure that inserting group loads before the first load and
468 stores after the last store will do the right thing.
469 Similar for groups like
470 a[i] = ...;
471 ... = a[i];
472 a[i+1] = ...;
473 where loads from the group interleave with the store. */
474 if (!vect_preserves_scalar_order_p (vect_dr_stmt(dra),
475 vect_dr_stmt (drb)))
477 if (dump_enabled_p ())
478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
479 "READ_WRITE dependence in interleaving.\n");
480 return true;
483 if (loop->safelen < 2)
485 tree indicator = dr_zero_step_indicator (dra);
486 if (!indicator || integer_zerop (indicator))
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
490 "access also has a zero step\n");
491 return true;
493 else if (TREE_CODE (indicator) != INTEGER_CST)
494 vect_check_nonzero_value (loop_vinfo, indicator);
496 continue;
499 if (dist > 0 && DDR_REVERSED_P (ddr))
501 /* If DDR_REVERSED_P the order of the data-refs in DDR was
502 reversed (to make distance vector positive), and the actual
503 distance is negative. */
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
506 "dependence distance negative.\n");
507 /* Record a negative dependence distance to later limit the
508 amount of stmt copying / unrolling we can perform.
509 Only need to handle read-after-write dependence. */
510 if (DR_IS_READ (drb)
511 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
512 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
513 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
514 continue;
517 unsigned int abs_dist = abs (dist);
518 if (abs_dist >= 2 && abs_dist < *max_vf)
520 /* The dependence distance requires reduction of the maximal
521 vectorization factor. */
522 *max_vf = abs (dist);
523 if (dump_enabled_p ())
524 dump_printf_loc (MSG_NOTE, vect_location,
525 "adjusting maximal vectorization factor to %i\n",
526 *max_vf);
529 if (abs_dist >= *max_vf)
531 /* Dependence distance does not create dependence, as far as
532 vectorization is concerned, in this case. */
533 if (dump_enabled_p ())
534 dump_printf_loc (MSG_NOTE, vect_location,
535 "dependence distance >= VF.\n");
536 continue;
539 if (dump_enabled_p ())
541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
542 "not vectorized, possible dependence "
543 "between data-refs ");
544 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
545 dump_printf (MSG_NOTE, " and ");
546 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
547 dump_printf (MSG_NOTE, "\n");
550 return true;
553 return false;
556 /* Function vect_analyze_data_ref_dependences.
558 Examine all the data references in the loop, and make sure there do not
559 exist any data dependences between them. Set *MAX_VF according to
560 the maximum vectorization factor the data dependences allow. */
562 bool
563 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
564 unsigned int *max_vf)
566 unsigned int i;
567 struct data_dependence_relation *ddr;
569 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
571 LOOP_VINFO_DDRS (loop_vinfo)
572 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
573 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
574 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
575 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
576 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
577 &LOOP_VINFO_DDRS (loop_vinfo),
578 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
579 return false;
581 /* For epilogues we either have no aliases or alias versioning
582 was applied to the original loop. Therefore we may just take max_vf
583 from the VF of the original loop. */
584 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
585 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
586 else
587 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
588 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
589 return false;
591 return true;
595 /* Function vect_slp_analyze_data_ref_dependence.
597 Return TRUE if there (might) exist a dependence between a memory-reference
598 DRA and a memory-reference DRB that would prohibit SLP vectorization,
599 and FALSE if the two references are known to be independent. */
602 static bool
603 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
605 struct data_reference *dra = DDR_A (ddr);
606 struct data_reference *drb = DDR_B (ddr);
608 /* We need to check dependences of statements marked as unvectorizable
609 as well; they can still prohibit vectorization. */
611 /* Independent data accesses. */
612 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
613 return false;
615 if (dra == drb)
616 return false;
618 /* Read-read is OK. */
619 if (DR_IS_READ (dra) && DR_IS_READ (drb))
620 return false;
622 /* If dra and drb are part of the same interleaving chain consider
623 them independent. */
624 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (vect_dr_stmt (dra)))
625 && (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (dra)))
626 == DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (drb)))))
627 return false;
629 /* Unknown data dependence. */
630 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
632 if (dump_enabled_p ())
634 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
635 "can't determine dependence between ");
636 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
637 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
638 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
639 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
642 else if (dump_enabled_p ())
644 dump_printf_loc (MSG_NOTE, vect_location,
645 "determined dependence between ");
646 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
647 dump_printf (MSG_NOTE, " and ");
648 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
649 dump_printf (MSG_NOTE, "\n");
652 return true;
656 /* Analyze dependences involved in the transform of SLP NODE. STORES
657 contain the vector of scalar stores of this instance if we are
658 disambiguating the loads. */
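/* For example, for the scalar statements

     a[0] = x;
     *p = z;
     a[1] = y;

   with { a[0], a[1] } forming the SLP store group, the vectorized
   store is emitted at the position of a[1] = y, which effectively
   sinks a[0] = x past *p = z.  The walk below therefore checks that
   *p cannot alias the group's accesses.  */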
660 static bool
661 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
662 vec<gimple *> stores, gimple *last_store)
664 /* This walks over all stmts involved in the SLP load/store done
665 in NODE verifying we can sink them up to the last stmt in the
666 group. */
667 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
668 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
670 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
671 if (access == last_access)
672 continue;
673 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
674 ao_ref ref;
675 bool ref_initialized_p = false;
676 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
677 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
679 gimple *stmt = gsi_stmt (gsi);
680 if (! gimple_vuse (stmt)
681 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
682 continue;
684 /* If we couldn't record a (single) data reference for this
685 stmt we have to resort to the alias oracle. */
686 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
687 if (!dr_b)
689 /* We are moving a store or sinking a load - this means
690 we cannot use TBAA for disambiguation. */
691 if (!ref_initialized_p)
692 ao_ref_init (&ref, DR_REF (dr_a));
693 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
694 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
695 return false;
696 continue;
699 bool dependent = false;
700 /* If we run into a store of this same instance (we've just
701 marked those) then delay dependence checking until we run
702 into the last store because this is where it will have
703 been sunk to (and we verify if we can do that as well). */
704 if (gimple_visited_p (stmt))
706 if (stmt != last_store)
707 continue;
708 unsigned i;
709 gimple *store;
710 FOR_EACH_VEC_ELT (stores, i, store)
712 data_reference *store_dr
713 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
714 ddr_p ddr = initialize_data_dependence_relation
715 (dr_a, store_dr, vNULL);
716 dependent = vect_slp_analyze_data_ref_dependence (ddr);
717 free_dependence_relation (ddr);
718 if (dependent)
719 break;
722 else
724 ddr_p ddr = initialize_data_dependence_relation (dr_a,
725 dr_b, vNULL);
726 dependent = vect_slp_analyze_data_ref_dependence (ddr);
727 free_dependence_relation (ddr);
729 if (dependent)
730 return false;
733 return true;
737 /* Function vect_slp_analyze_instance_dependence.
739 Examine all the data references in the SLP instance, and make sure
740 there do not exist any data dependences between them that would
741 prohibit vectorizing the instance. */
743 bool
744 vect_slp_analyze_instance_dependence (slp_instance instance)
746 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
748 /* The stores of this instance are at the root of the SLP tree. */
749 slp_tree store = SLP_INSTANCE_TREE (instance);
750 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
751 store = NULL;
753 /* Verify we can sink stores to the vectorized stmt insert location. */
754 gimple *last_store = NULL;
755 if (store)
757 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
758 return false;
760 /* Mark stores in this instance and remember the last one. */
761 last_store = vect_find_last_scalar_stmt_in_slp (store);
762 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
763 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
766 bool res = true;
768 /* Verify we can sink loads to the vectorized stmt insert location,
769 special-casing stores of this instance. */
770 slp_tree load;
771 unsigned int i;
772 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
773 if (! vect_slp_analyze_node_dependences (instance, load,
774 store
775 ? SLP_TREE_SCALAR_STMTS (store)
776 : vNULL, last_store))
778 res = false;
779 break;
782 /* Unset the visited flag. */
783 if (store)
784 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
785 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
787 return res;
790 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
791 the statement that contains DRB, which is useful for recording in the
792 dump file. */
794 static void
795 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
796 innermost_loop_behavior *drb)
798 bool existed;
799 innermost_loop_behavior *&entry
800 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
801 if (!existed || entry->base_alignment < drb->base_alignment)
803 entry = drb;
804 if (dump_enabled_p ())
806 dump_printf_loc (MSG_NOTE, vect_location,
807 "recording new base alignment for ");
808 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
809 dump_printf (MSG_NOTE, "\n");
810 dump_printf_loc (MSG_NOTE, vect_location,
811 " alignment: %d\n", drb->base_alignment);
812 dump_printf_loc (MSG_NOTE, vect_location,
813 " misalignment: %d\n", drb->base_misalignment);
814 dump_printf_loc (MSG_NOTE, vect_location,
815 " based on: ");
816 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
821 /* If the region we're going to vectorize is reached, all unconditional
822 data references occur at least once. We can therefore pool the base
823 alignment guarantees from each unconditional reference. Do this by
824 going through all the data references in VINFO and checking whether
825 the containing statement makes the reference unconditionally. If so,
826 record the alignment of the base address in VINFO so that it can be
827 used for all other references with the same base. */
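/* For example, if a loop unconditionally reads q[i] and only
   conditionally writes q[i + 1], both references share the base
   address q, so the alignment guarantee recorded for the
   unconditional read can be reused when computing the misalignment
   of the conditional write.  */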
829 void
830 vect_record_base_alignments (vec_info *vinfo)
832 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
833 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
834 data_reference *dr;
835 unsigned int i;
836 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
838 gimple *stmt = vect_dr_stmt (dr);
839 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
840 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
841 && STMT_VINFO_VECTORIZABLE (stmt_info)
842 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
844 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
846 /* If DR is nested in the loop that is being vectorized, we can also
847 record the alignment of the base wrt the outer loop. */
848 if (loop && nested_in_vect_loop_p (loop, stmt))
849 vect_record_base_alignment
850 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
855 /* Return the target alignment for the vectorized form of DR. */
857 static unsigned int
858 vect_calculate_target_alignment (struct data_reference *dr)
860 gimple *stmt = vect_dr_stmt (dr);
861 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
862 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
863 return targetm.vectorize.preferred_vector_alignment (vectype);
866 /* Function vect_compute_data_ref_alignment
868 Compute the misalignment of the data reference DR.
870 Output:
871 1. DR_MISALIGNMENT (DR) is defined.
873 FOR NOW: No analysis is actually performed. Misalignment is calculated
874 only for trivial cases. TODO. */
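/* As a simple worked example: for a read of a[i + 1] from a 16-byte
   aligned array of 4-byte ints, and assuming a target alignment of
   16 bytes, the base misalignment is 0 and the constant offset is 4,
   so DR_MISALIGNMENT ends up as 4.  */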
876 static void
877 vect_compute_data_ref_alignment (struct data_reference *dr)
879 gimple *stmt = vect_dr_stmt (dr);
880 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
881 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
882 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
883 struct loop *loop = NULL;
884 tree ref = DR_REF (dr);
885 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
887 if (dump_enabled_p ())
888 dump_printf_loc (MSG_NOTE, vect_location,
889 "vect_compute_data_ref_alignment:\n");
891 if (loop_vinfo)
892 loop = LOOP_VINFO_LOOP (loop_vinfo);
894 /* Initialize misalignment to unknown. */
895 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
897 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
898 return;
900 innermost_loop_behavior *drb = vect_dr_behavior (dr);
901 bool step_preserves_misalignment_p;
903 unsigned HOST_WIDE_INT vector_alignment
904 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
905 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
907 /* No step for BB vectorization. */
908 if (!loop)
910 gcc_assert (integer_zerop (drb->step));
911 step_preserves_misalignment_p = true;
914 /* In case the dataref is in an inner-loop of the loop that is being
915 vectorized (LOOP), we use the base and misalignment information
916 relative to the outer-loop (LOOP). This is ok only if the misalignment
917 stays the same throughout the execution of the inner-loop, which is why
918 we have to check that the stride of the dataref in the inner-loop is a
919 multiple of the vector alignment. */
920 else if (nested_in_vect_loop_p (loop, stmt))
922 step_preserves_misalignment_p
923 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
925 if (dump_enabled_p ())
927 if (step_preserves_misalignment_p)
928 dump_printf_loc (MSG_NOTE, vect_location,
929 "inner step divides the vector alignment.\n");
930 else
931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
932 "inner step doesn't divide the vector"
933 " alignment.\n");
937 /* Similarly we can only use base and misalignment information relative to
938 an innermost loop if the misalignment stays the same throughout the
939 execution of the loop. As above, this is the case if the stride of
940 the dataref, scaled by the vectorization factor, is a multiple of the alignment. */
941 else
943 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
944 step_preserves_misalignment_p
945 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
947 if (!step_preserves_misalignment_p && dump_enabled_p ())
948 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
949 "step doesn't divide the vector alignment.\n");
952 unsigned int base_alignment = drb->base_alignment;
953 unsigned int base_misalignment = drb->base_misalignment;
955 /* Calculate the maximum of the pooled base address alignment and the
956 alignment that we can compute for DR itself. */
957 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
958 if (entry && base_alignment < (*entry)->base_alignment)
960 base_alignment = (*entry)->base_alignment;
961 base_misalignment = (*entry)->base_misalignment;
964 if (drb->offset_alignment < vector_alignment
965 || !step_preserves_misalignment_p
966 /* We need to know whether the step wrt the vectorized loop is
967 negative when computing the starting misalignment below. */
968 || TREE_CODE (drb->step) != INTEGER_CST)
970 if (dump_enabled_p ())
972 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
973 "Unknown alignment for access: ");
974 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
975 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
977 return;
980 if (base_alignment < vector_alignment)
982 unsigned int max_alignment;
983 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
984 if (max_alignment < vector_alignment
985 || !vect_can_force_dr_alignment_p (base,
986 vector_alignment * BITS_PER_UNIT))
988 if (dump_enabled_p ())
990 dump_printf_loc (MSG_NOTE, vect_location,
991 "can't force alignment of ref: ");
992 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
993 dump_printf (MSG_NOTE, "\n");
995 return;
998 /* Force the alignment of the decl.
999 NOTE: This is the only change to the code we make during
1000 the analysis phase, before deciding to vectorize the loop. */
1001 if (dump_enabled_p ())
1003 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
1004 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1005 dump_printf (MSG_NOTE, "\n");
1008 DR_VECT_AUX (dr)->base_decl = base;
1009 DR_VECT_AUX (dr)->base_misaligned = true;
1010 base_misalignment = 0;
1012 poly_int64 misalignment
1013 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1015 /* If this is a backward running DR then the first access in the larger
1016 vectype actually is N-1 elements before the address recorded in the DR.
1017 Adjust the misalignment accordingly. */
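/* E.g. for a V4SI access with a step of -4 bytes the vector load
   starts 3 elements (12 bytes) below the address recorded in the DR,
   so (4 - 1) * -4 = -12 is added to the misalignment here.  */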
1018 if (tree_int_cst_sgn (drb->step) < 0)
1019 /* PLUS because STEP is negative. */
1020 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1021 * TREE_INT_CST_LOW (drb->step));
1023 unsigned int const_misalignment;
1024 if (!known_misalignment (misalignment, vector_alignment,
1025 &const_misalignment))
1027 if (dump_enabled_p ())
1029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1030 "Non-constant misalignment for access: ");
1031 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1032 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1034 return;
1037 SET_DR_MISALIGNMENT (dr, const_misalignment);
1039 if (dump_enabled_p ())
1041 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1042 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1043 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1044 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1047 return;
1050 /* Function vect_update_misalignment_for_peel.
1051 Sets DR's misalignment
1052 - to 0 if it has the same alignment as DR_PEEL,
1053 - to the misalignment computed using NPEEL if DR's misalignment is known,
1054 - to -1 (unknown) otherwise.
1056 DR - the data reference whose misalignment is to be adjusted.
1057 DR_PEEL - the data reference whose misalignment is being made
1058 zero in the vector loop by the peel.
1059 NPEEL - the number of iterations in the peel loop if the misalignment
1060 of DR_PEEL is known at compile time. */
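/* Worked example, assuming a 16-byte target alignment and a forward
   (positive) step: a data-ref with 4-byte elements and misalignment
   12 that is advanced by NPEEL = 1 iteration ends up with
   (12 + 1 * 4) & 15 == 0, i.e. aligned, whereas NPEEL = 2 would
   leave it misaligned by 4 bytes.  */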
1062 static void
1063 vect_update_misalignment_for_peel (struct data_reference *dr,
1064 struct data_reference *dr_peel, int npeel)
1066 unsigned int i;
1067 vec<dr_p> same_aligned_drs;
1068 struct data_reference *current_dr;
1069 int dr_size = vect_get_scalar_dr_size (dr);
1070 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1071 stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
1072 stmt_vec_info peel_stmt_info = vinfo_for_stmt (vect_dr_stmt (dr_peel));
1074 /* For interleaved data accesses the step in the loop must be multiplied by
1075 the size of the interleaving group. */
1076 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1077 dr_size *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
1078 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1079 dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1081 /* It can be assumed that the data refs with the same alignment as dr_peel
1082 are aligned in the vector loop. */
1083 same_aligned_drs
1084 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (vect_dr_stmt (dr_peel)));
1085 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1087 if (current_dr != dr)
1088 continue;
1089 gcc_assert (!known_alignment_for_access_p (dr)
1090 || !known_alignment_for_access_p (dr_peel)
1091 || (DR_MISALIGNMENT (dr) / dr_size
1092 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1093 SET_DR_MISALIGNMENT (dr, 0);
1094 return;
1097 if (known_alignment_for_access_p (dr)
1098 && known_alignment_for_access_p (dr_peel))
1100 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1101 int misal = DR_MISALIGNMENT (dr);
1102 misal += negative ? -npeel * dr_size : npeel * dr_size;
1103 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1104 SET_DR_MISALIGNMENT (dr, misal);
1105 return;
1108 if (dump_enabled_p ())
1109 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1110 "to unknown (-1).\n");
1111 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1115 /* Function verify_data_ref_alignment
1117 Return TRUE if DR can be handled with respect to alignment. */
1119 static bool
1120 verify_data_ref_alignment (data_reference_p dr)
1122 enum dr_alignment_support supportable_dr_alignment
1123 = vect_supportable_dr_alignment (dr, false);
1124 if (!supportable_dr_alignment)
1126 if (dump_enabled_p ())
1128 if (DR_IS_READ (dr))
1129 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1130 "not vectorized: unsupported unaligned load.");
1131 else
1132 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1133 "not vectorized: unsupported unaligned "
1134 "store.");
1136 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1137 DR_REF (dr));
1138 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1140 return false;
1143 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1144 dump_printf_loc (MSG_NOTE, vect_location,
1145 "Vectorizing an unaligned access.\n");
1147 return true;
1150 /* Function vect_verify_datarefs_alignment
1152 Return TRUE if all data references in the loop can be
1153 handled with respect to alignment. */
1155 bool
1156 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1158 vec<data_reference_p> datarefs = vinfo->datarefs;
1159 struct data_reference *dr;
1160 unsigned int i;
1162 FOR_EACH_VEC_ELT (datarefs, i, dr)
1164 gimple *stmt = vect_dr_stmt (dr);
1165 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1167 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1168 continue;
1170 /* For interleaving, only the alignment of the first access matters. */
1171 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1172 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1173 continue;
1175 /* Strided accesses perform only component accesses, alignment is
1176 irrelevant for them. */
1177 if (STMT_VINFO_STRIDED_P (stmt_info)
1178 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1179 continue;
1181 if (! verify_data_ref_alignment (dr))
1182 return false;
1185 return true;
1188 /* Given a memory reference EXP, return whether its alignment is less
1189 than its size. */
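/* E.g. a 4-byte int member of a struct declared with
   __attribute__((packed)) may be guaranteed only 1-byte alignment,
   in which case its alignment is smaller than its size and the
   caller below treats the reference as packed.  */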
1191 static bool
1192 not_size_aligned (tree exp)
1194 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1195 return true;
1197 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1198 > get_object_alignment (exp));
1201 /* Function vector_alignment_reachable_p
1203 Return true if vector alignment for DR is reachable by peeling
1204 a few loop iterations. Return false otherwise. */
1206 static bool
1207 vector_alignment_reachable_p (struct data_reference *dr)
1209 gimple *stmt = vect_dr_stmt (dr);
1210 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1211 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1213 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1215 /* For interleaved access we peel only if the number of iterations in
1216 the prolog loop (VF - misalignment) is a multiple of the
1217 number of the interleaved accesses. */
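/* E.g. for a group of 2 accesses using 4-element vectors and a
   misalignment of one element, the prolog would need 4 - 1 = 3
   iterations; 3 is not a multiple of the group size 2, so alignment
   cannot be reached by peeling and false is returned.  */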
1218 int elem_size, mis_in_elements;
1220 /* FORNOW: handle only known alignment. */
1221 if (!known_alignment_for_access_p (dr))
1222 return false;
1224 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1225 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1226 elem_size = vector_element_size (vector_size, nelements);
1227 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1229 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1230 return false;
1233 /* If the misalignment is known at compile time then allow peeling
1234 only if natural alignment is reachable through peeling. */
1235 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1237 HOST_WIDE_INT elmsize =
1238 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1239 if (dump_enabled_p ())
1241 dump_printf_loc (MSG_NOTE, vect_location,
1242 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1243 dump_printf (MSG_NOTE,
1244 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1246 if (DR_MISALIGNMENT (dr) % elmsize)
1248 if (dump_enabled_p ())
1249 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1250 "data size does not divide the misalignment.\n");
1251 return false;
1255 if (!known_alignment_for_access_p (dr))
1257 tree type = TREE_TYPE (DR_REF (dr));
1258 bool is_packed = not_size_aligned (DR_REF (dr));
1259 if (dump_enabled_p ())
1260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1261 "Unknown misalignment, %snaturally aligned\n",
1262 is_packed ? "not " : "");
1263 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1266 return true;
1270 /* Calculate the cost of the memory access represented by DR. */
1272 static void
1273 vect_get_data_access_cost (struct data_reference *dr,
1274 unsigned int *inside_cost,
1275 unsigned int *outside_cost,
1276 stmt_vector_for_cost *body_cost_vec,
1277 stmt_vector_for_cost *prologue_cost_vec)
1279 gimple *stmt = vect_dr_stmt (dr);
1280 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1281 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1282 int ncopies;
1284 if (PURE_SLP_STMT (stmt_info))
1285 ncopies = 1;
1286 else
1287 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1289 if (DR_IS_READ (dr))
1290 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1291 prologue_cost_vec, body_cost_vec, false);
1292 else
1293 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1295 if (dump_enabled_p ())
1296 dump_printf_loc (MSG_NOTE, vect_location,
1297 "vect_get_data_access_cost: inside_cost = %d, "
1298 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1302 typedef struct _vect_peel_info
1304 struct data_reference *dr;
1305 int npeel;
1306 unsigned int count;
1307 } *vect_peel_info;
1309 typedef struct _vect_peel_extended_info
1311 struct _vect_peel_info peel_info;
1312 unsigned int inside_cost;
1313 unsigned int outside_cost;
1314 } *vect_peel_extended_info;
1317 /* Peeling hashtable helpers. */
1319 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1321 static inline hashval_t hash (const _vect_peel_info *);
1322 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1325 inline hashval_t
1326 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1328 return (hashval_t) peel_info->npeel;
1331 inline bool
1332 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1334 return (a->npeel == b->npeel);
1338 /* Insert DR into peeling hash table with NPEEL as key. */
1340 static void
1341 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1342 loop_vec_info loop_vinfo, struct data_reference *dr,
1343 int npeel)
1345 struct _vect_peel_info elem, *slot;
1346 _vect_peel_info **new_slot;
1347 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1349 elem.npeel = npeel;
1350 slot = peeling_htab->find (&elem);
1351 if (slot)
1352 slot->count++;
1353 else
1355 slot = XNEW (struct _vect_peel_info);
1356 slot->npeel = npeel;
1357 slot->dr = dr;
1358 slot->count = 1;
1359 new_slot = peeling_htab->find_slot (slot, INSERT);
1360 *new_slot = slot;
1363 if (!supportable_dr_alignment
1364 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1365 slot->count += VECT_MAX_COST;
1369 /* Traverse peeling hash table to find peeling option that aligns maximum
1370 number of data accesses. */
1373 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1374 _vect_peel_extended_info *max)
1376 vect_peel_info elem = *slot;
1378 if (elem->count > max->peel_info.count
1379 || (elem->count == max->peel_info.count
1380 && max->peel_info.npeel > elem->npeel))
1382 max->peel_info.npeel = elem->npeel;
1383 max->peel_info.count = elem->count;
1384 max->peel_info.dr = elem->dr;
1387 return 1;
1390 /* Get the costs of peeling NPEEL iterations checking data access costs
1391 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1392 misalignment will be zero after peeling. */
1394 static void
1395 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1396 struct data_reference *dr0,
1397 unsigned int *inside_cost,
1398 unsigned int *outside_cost,
1399 stmt_vector_for_cost *body_cost_vec,
1400 stmt_vector_for_cost *prologue_cost_vec,
1401 unsigned int npeel,
1402 bool unknown_misalignment)
1404 unsigned i;
1405 data_reference *dr;
1407 FOR_EACH_VEC_ELT (datarefs, i, dr)
1409 gimple *stmt = vect_dr_stmt (dr);
1410 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1411 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1412 continue;
1414 /* For interleaving, only the alignment of the first access
1415 matters. */
1416 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1417 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1418 continue;
1420 /* Strided accesses perform only component accesses, alignment is
1421 irrelevant for them. */
1422 if (STMT_VINFO_STRIDED_P (stmt_info)
1423 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1424 continue;
1426 int save_misalignment;
1427 save_misalignment = DR_MISALIGNMENT (dr);
1428 if (npeel == 0)
1430 else if (unknown_misalignment && dr == dr0)
1431 SET_DR_MISALIGNMENT (dr, 0);
1432 else
1433 vect_update_misalignment_for_peel (dr, dr0, npeel);
1434 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1435 body_cost_vec, prologue_cost_vec);
1436 SET_DR_MISALIGNMENT (dr, save_misalignment);
1440 /* Traverse peeling hash table and calculate cost for each peeling option.
1441 Find the one with the lowest cost. */
1444 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1445 _vect_peel_extended_info *min)
1447 vect_peel_info elem = *slot;
1448 int dummy;
1449 unsigned int inside_cost = 0, outside_cost = 0;
1450 gimple *stmt = vect_dr_stmt (elem->dr);
1451 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1452 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1453 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1454 epilogue_cost_vec;
1456 prologue_cost_vec.create (2);
1457 body_cost_vec.create (2);
1458 epilogue_cost_vec.create (2);
1460 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1461 elem->dr, &inside_cost, &outside_cost,
1462 &body_cost_vec, &prologue_cost_vec,
1463 elem->npeel, false);
1465 body_cost_vec.release ();
1467 outside_cost += vect_get_known_peeling_cost
1468 (loop_vinfo, elem->npeel, &dummy,
1469 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1470 &prologue_cost_vec, &epilogue_cost_vec);
1472 /* Prologue and epilogue costs are added to the target model later.
1473 These costs depend only on the scalar iteration cost, the
1474 number of peeling iterations finally chosen, and the number of
1475 misaligned statements. So discard the information found here. */
1476 prologue_cost_vec.release ();
1477 epilogue_cost_vec.release ();
1479 if (inside_cost < min->inside_cost
1480 || (inside_cost == min->inside_cost
1481 && outside_cost < min->outside_cost))
1483 min->inside_cost = inside_cost;
1484 min->outside_cost = outside_cost;
1485 min->peel_info.dr = elem->dr;
1486 min->peel_info.npeel = elem->npeel;
1487 min->peel_info.count = elem->count;
1490 return 1;
1494 /* Choose best peeling option by traversing peeling hash table and either
1495 choosing an option with the lowest cost (if cost model is enabled) or the
1496 option that aligns as many accesses as possible. */
1498 static struct _vect_peel_extended_info
1499 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1500 loop_vec_info loop_vinfo)
1502 struct _vect_peel_extended_info res;
1504 res.peel_info.dr = NULL;
1506 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1508 res.inside_cost = INT_MAX;
1509 res.outside_cost = INT_MAX;
1510 peeling_htab->traverse <_vect_peel_extended_info *,
1511 vect_peeling_hash_get_lowest_cost> (&res);
1513 else
1515 res.peel_info.count = 0;
1516 peeling_htab->traverse <_vect_peel_extended_info *,
1517 vect_peeling_hash_get_most_frequent> (&res);
1518 res.inside_cost = 0;
1519 res.outside_cost = 0;
1522 return res;
1525 /* Return true if the new peeling NPEEL is supported. */
1527 static bool
1528 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1529 unsigned npeel)
1531 unsigned i;
1532 struct data_reference *dr = NULL;
1533 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1534 gimple *stmt;
1535 stmt_vec_info stmt_info;
1536 enum dr_alignment_support supportable_dr_alignment;
1538 /* Ensure that all data refs can be vectorized after the peel. */
1539 FOR_EACH_VEC_ELT (datarefs, i, dr)
1541 int save_misalignment;
1543 if (dr == dr0)
1544 continue;
1546 stmt = vect_dr_stmt (dr);
1547 stmt_info = vinfo_for_stmt (stmt);
1548 /* For interleaving, only the alignment of the first access
1549 matters. */
1550 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1551 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1552 continue;
1554 /* Strided accesses perform only component accesses, alignment is
1555 irrelevant for them. */
1556 if (STMT_VINFO_STRIDED_P (stmt_info)
1557 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1558 continue;
1560 save_misalignment = DR_MISALIGNMENT (dr);
1561 vect_update_misalignment_for_peel (dr, dr0, npeel);
1562 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1563 SET_DR_MISALIGNMENT (dr, save_misalignment);
1565 if (!supportable_dr_alignment)
1566 return false;
1569 return true;
1572 /* Function vect_enhance_data_refs_alignment
1574 This pass will use loop versioning and loop peeling in order to enhance
1575 the alignment of data references in the loop.
1577 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1578 original loop is to be vectorized. Any other loops that are created by
1579 the transformations performed in this pass - are not supposed to be
1580 vectorized. This restriction will be relaxed.
1582 This pass will require a cost model to guide it whether to apply peeling
1583 or versioning or a combination of the two. For example, the scheme that
1584 Intel uses when given a loop with several memory accesses is as follows:
1585 choose one memory access ('p') whose alignment you want to force by doing
1586 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1587 other accesses are not necessarily aligned, or (2) use loop versioning to
1588 generate one loop in which all accesses are aligned, and another loop in
1589 which only 'p' is necessarily aligned.
1591 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1592 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1593 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1595 Devising a cost model is the most critical aspect of this work. It will
1596 guide us on which access to peel for, whether to use loop versioning, how
1597 many versions to create, etc. The cost model will probably consist of
1598 generic considerations as well as target specific considerations (on
1599 powerpc for example, misaligned stores are more painful than misaligned
1600 loads).
1602 Here are the general steps involved in alignment enhancements:
1604 -- original loop, before alignment analysis:
1605 for (i=0; i<N; i++){
1606 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1607 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1610 -- After vect_compute_data_refs_alignment:
1611 for (i=0; i<N; i++){
1612 x = q[i]; # DR_MISALIGNMENT(q) = 3
1613 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1616 -- Possibility 1: we do loop versioning:
1617 if (p is aligned) {
1618 for (i=0; i<N; i++){ # loop 1A
1619 x = q[i]; # DR_MISALIGNMENT(q) = 3
1620 p[i] = y; # DR_MISALIGNMENT(p) = 0
1623 else {
1624 for (i=0; i<N; i++){ # loop 1B
1625 x = q[i]; # DR_MISALIGNMENT(q) = 3
1626 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1630 -- Possibility 2: we do loop peeling:
1631 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1632 x = q[i];
1633 p[i] = y;
1635 for (i = 3; i < N; i++){ # loop 2A
1636 x = q[i]; # DR_MISALIGNMENT(q) = 0
1637 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1640 -- Possibility 3: combination of loop peeling and versioning:
1641 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1642 x = q[i];
1643 p[i] = y;
1645 if (p is aligned) {
1646 for (i = 3; i<N; i++){ # loop 3A
1647 x = q[i]; # DR_MISALIGNMENT(q) = 0
1648 p[i] = y; # DR_MISALIGNMENT(p) = 0
1651 else {
1652 for (i = 3; i<N; i++){ # loop 3B
1653 x = q[i]; # DR_MISALIGNMENT(q) = 0
1654 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1658 These loops are later passed to loop_transform to be vectorized. The
1659 vectorizer will use the alignment information to guide the transformation
1660 (whether to generate regular loads/stores, or with special handling for
1661 misalignment). */
1663 bool
1664 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1666 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1667 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1668 enum dr_alignment_support supportable_dr_alignment;
1669 struct data_reference *dr0 = NULL, *first_store = NULL;
1670 struct data_reference *dr;
1671 unsigned int i, j;
1672 bool do_peeling = false;
1673 bool do_versioning = false;
1674 bool stat;
1675 gimple *stmt;
1676 stmt_vec_info stmt_info;
1677 unsigned int npeel = 0;
1678 bool one_misalignment_known = false;
1679 bool one_misalignment_unknown = false;
1680 bool one_dr_unsupportable = false;
1681 struct data_reference *unsupportable_dr = NULL;
1682 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1683 unsigned possible_npeel_number = 1;
1684 tree vectype;
1685 unsigned int mis, same_align_drs_max = 0;
1686 hash_table<peel_info_hasher> peeling_htab (1);
1688 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1690 /* Reset data so we can safely be called multiple times. */
1691 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1692 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1694 /* While cost model enhancements are expected in the future, the high level
1695 view of the code at this time is as follows:
1697 A) If there is a misaligned access then see if peeling to align
1698 this access can make all data references satisfy
1699 vect_supportable_dr_alignment. If so, update data structures
1700 as needed and return true.
1702 B) If peeling wasn't possible and there is a data reference with an
1703 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1704 then see if loop versioning checks can be used to make all data
1705 references satisfy vect_supportable_dr_alignment. If so, update
1706 data structures as needed and return true.
1708 C) If neither peeling nor versioning were successful then return false if
1709 any data reference does not satisfy vect_supportable_dr_alignment.
1711 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1713 Note, Possibility 3 above (which is peeling and versioning together) is not
1714 being done at this time. */
1716 /* (1) Peeling to force alignment. */
1718 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1719 Considerations:
1720 + How many accesses will become aligned due to the peeling
1721 - How many accesses will become unaligned due to the peeling,
1722 and the cost of misaligned accesses.
1723 - The cost of peeling (the extra runtime checks, the increase
1724 in code size). */
1726 FOR_EACH_VEC_ELT (datarefs, i, dr)
1728 stmt = vect_dr_stmt (dr);
1729 stmt_info = vinfo_for_stmt (stmt);
1731 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1732 continue;
1734 /* For interleaving, only the alignment of the first access
1735 matters. */
1736 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1737 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1738 continue;
1740 /* For scatter-gather or invariant accesses there is nothing
1741 to enhance. */
1742 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1743 || integer_zerop (DR_STEP (dr)))
1744 continue;
1746 /* Strided accesses perform only component accesses, alignment is
1747 irrelevant for them. */
1748 if (STMT_VINFO_STRIDED_P (stmt_info)
1749 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1750 continue;
1752 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1753 do_peeling = vector_alignment_reachable_p (dr);
1754 if (do_peeling)
1756 if (known_alignment_for_access_p (dr))
1758 unsigned int npeel_tmp = 0;
1759 bool negative = tree_int_cst_compare (DR_STEP (dr),
1760 size_zero_node) < 0;
1762 vectype = STMT_VINFO_VECTYPE (stmt_info);
1763 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1764 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1765 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1766 if (DR_MISALIGNMENT (dr) != 0)
1767 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1769 /* For multiple types, it is possible that the bigger type access
1770 will have more than one peeling option. E.g., a loop with two
1771 types: one of size (vector size / 4), and the other one of
1772 size (vector size / 8). The vectorization factor will be 8. If both
1773 accesses are misaligned by 3, the first one needs one scalar
1774 iteration to be aligned, and the second one needs 5. But the
1775 first one will be aligned also by peeling 5 scalar
1776 iterations, and in that case both accesses will be aligned.
1777 Hence, in addition to the immediate peeling amount, we also want
1778 to try adding a full vector size, as long as we don't exceed
1779 the vectorization factor.
1780 We do this automatically for the cost model, since we calculate
1781 the cost for every peeling option. */
1782 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1784 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1785 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1786 possible_npeel_number
1787 = vect_get_num_vectors (nscalars, vectype);
1789 /* NPEEL_TMP is 0 when there is no misalignment, but also
1790 allow peeling NELEMENTS. */
1791 if (DR_MISALIGNMENT (dr) == 0)
1792 possible_npeel_number++;
1795 /* Save info about DR in the hash table. Also include peeling
1796 amounts according to the explanation above. */
1797 for (j = 0; j < possible_npeel_number; j++)
1799 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1800 dr, npeel_tmp);
1801 npeel_tmp += target_align / dr_size;
1804 one_misalignment_known = true;
1806 else
1808 /* If we don't know any misalignment values, we prefer
1809 peeling for the data-ref that has the maximum number of data-refs
1810 with the same alignment, unless the target prefers to align
1811 stores over loads. */
1812 unsigned same_align_drs
1813 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1814 if (!dr0
1815 || same_align_drs_max < same_align_drs)
1817 same_align_drs_max = same_align_drs;
1818 dr0 = dr;
1820 /* For data-refs with the same number of related
1821 accesses prefer the one where the misalign
1822 computation will be invariant in the outermost loop. */
1823 else if (same_align_drs_max == same_align_drs)
1825 struct loop *ivloop0, *ivloop;
1826 ivloop0 = outermost_invariant_loop_for_expr
1827 (loop, DR_BASE_ADDRESS (dr0));
1828 ivloop = outermost_invariant_loop_for_expr
1829 (loop, DR_BASE_ADDRESS (dr));
1830 if ((ivloop && !ivloop0)
1831 || (ivloop && ivloop0
1832 && flow_loop_nested_p (ivloop, ivloop0)))
1833 dr0 = dr;
1836 one_misalignment_unknown = true;
1838 /* Check for data refs with unsupportable alignment that
1839 can be peeled. */
1840 if (!supportable_dr_alignment)
1842 one_dr_unsupportable = true;
1843 unsupportable_dr = dr;
1846 if (!first_store && DR_IS_WRITE (dr))
1847 first_store = dr;
1850 else
1852 if (!aligned_access_p (dr))
1854 if (dump_enabled_p ())
1855 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1856 "vector alignment may not be reachable\n");
1857 break;
1862 /* Check if we can possibly peel the loop. */
1863 if (!vect_can_advance_ivs_p (loop_vinfo)
1864 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1865 || loop->inner)
1866 do_peeling = false;
1868 struct _vect_peel_extended_info peel_for_known_alignment;
1869 struct _vect_peel_extended_info peel_for_unknown_alignment;
1870 struct _vect_peel_extended_info best_peel;
1872 peel_for_unknown_alignment.inside_cost = INT_MAX;
1873 peel_for_unknown_alignment.outside_cost = INT_MAX;
1874 peel_for_unknown_alignment.peel_info.count = 0;
1876 if (do_peeling
1877 && one_misalignment_unknown)
1879 /* Check whether the target requires us to prefer stores over loads, i.e.,
1880 whether misaligned stores are more expensive than misaligned loads
1881 (taking drs with the same alignment into account). */
1882 unsigned int load_inside_cost = 0;
1883 unsigned int load_outside_cost = 0;
1884 unsigned int store_inside_cost = 0;
1885 unsigned int store_outside_cost = 0;
1886 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1888 stmt_vector_for_cost dummy;
1889 dummy.create (2);
1890 vect_get_peeling_costs_all_drs (datarefs, dr0,
1891 &load_inside_cost,
1892 &load_outside_cost,
1893 &dummy, &dummy, estimated_npeels, true);
1894 dummy.release ();
1896 if (first_store)
1898 dummy.create (2);
1899 vect_get_peeling_costs_all_drs (datarefs, first_store,
1900 &store_inside_cost,
1901 &store_outside_cost,
1902 &dummy, &dummy,
1903 estimated_npeels, true);
1904 dummy.release ();
1906 else
1908 store_inside_cost = INT_MAX;
1909 store_outside_cost = INT_MAX;
1912 if (load_inside_cost > store_inside_cost
1913 || (load_inside_cost == store_inside_cost
1914 && load_outside_cost > store_outside_cost))
1916 dr0 = first_store;
1917 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1918 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1920 else
1922 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1923 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1926 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1927 prologue_cost_vec.create (2);
1928 epilogue_cost_vec.create (2);
1930 int dummy2;
1931 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1932 (loop_vinfo, estimated_npeels, &dummy2,
1933 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1934 &prologue_cost_vec, &epilogue_cost_vec);
1936 prologue_cost_vec.release ();
1937 epilogue_cost_vec.release ();
1939 peel_for_unknown_alignment.peel_info.count = 1
1940 + STMT_VINFO_SAME_ALIGN_REFS
1941 (vinfo_for_stmt (vect_dr_stmt (dr0))).length ();
1944 peel_for_unknown_alignment.peel_info.npeel = 0;
1945 peel_for_unknown_alignment.peel_info.dr = dr0;
1947 best_peel = peel_for_unknown_alignment;
1949 peel_for_known_alignment.inside_cost = INT_MAX;
1950 peel_for_known_alignment.outside_cost = INT_MAX;
1951 peel_for_known_alignment.peel_info.count = 0;
1952 peel_for_known_alignment.peel_info.dr = NULL;
1954 if (do_peeling && one_misalignment_known)
1956 /* Peeling is possible, and no data access requires alignment in order
1957 to be supported. So we try to choose the best possible peeling from
1958 the hash table. */
1959 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1960 (&peeling_htab, loop_vinfo);
1963 /* Compare costs of peeling for known and unknown alignment. */
1964 if (peel_for_known_alignment.peel_info.dr != NULL
1965 && peel_for_unknown_alignment.inside_cost
1966 >= peel_for_known_alignment.inside_cost)
1968 best_peel = peel_for_known_alignment;
1970 /* If the best peeling for known alignment has NPEEL == 0, perform no
1971 peeling at all except if there is an unsupportable dr that we can
1972 align. */
1973 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1974 do_peeling = false;
1977 /* If there is an unsupportable data ref, prefer this over all choices so far
1978 since we'd have to discard a chosen peeling anyway, unless it happened to
1979 align the unsupportable data ref as well. */
1980 if (one_dr_unsupportable)
1981 dr0 = unsupportable_dr;
1982 else if (do_peeling)
1984 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1985 TODO: Use nopeel_outside_cost or get rid of it? */
1986 unsigned nopeel_inside_cost = 0;
1987 unsigned nopeel_outside_cost = 0;
1989 stmt_vector_for_cost dummy;
1990 dummy.create (2);
1991 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1992 &nopeel_outside_cost, &dummy, &dummy,
1993 0, false);
1994 dummy.release ();
1996 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1997 costs will be recorded. */
1998 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1999 prologue_cost_vec.create (2);
2000 epilogue_cost_vec.create (2);
2002 int dummy2;
2003 nopeel_outside_cost += vect_get_known_peeling_cost
2004 (loop_vinfo, 0, &dummy2,
2005 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2006 &prologue_cost_vec, &epilogue_cost_vec);
2008 prologue_cost_vec.release ();
2009 epilogue_cost_vec.release ();
2011 npeel = best_peel.peel_info.npeel;
2012 dr0 = best_peel.peel_info.dr;
2014 /* If not peeling at all is no more expensive than the best peeling
2015 we have so far, don't perform any peeling. */
2016 if (nopeel_inside_cost <= best_peel.inside_cost)
2017 do_peeling = false;
2020 if (do_peeling)
2022 stmt = vect_dr_stmt (dr0);
2023 stmt_info = vinfo_for_stmt (stmt);
2024 vectype = STMT_VINFO_VECTYPE (stmt_info);
2026 if (known_alignment_for_access_p (dr0))
2028 bool negative = tree_int_cst_compare (DR_STEP (dr0),
2029 size_zero_node) < 0;
2030 if (!npeel)
2032 /* Since it's known at compile time, compute the number of
2033 iterations in the peeled loop (the peeling factor) for use in
2034 updating DR_MISALIGNMENT values. The peeling factor is the
2035 vectorization factor minus the misalignment as an element
2036 count. */
2037 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2038 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2039 npeel = ((mis & (target_align - 1))
2040 / vect_get_scalar_dr_size (dr0));
2043 /* For interleaved data access every iteration accesses all the
2044 members of the group, therefore we divide the number of iterations
2045 by the group size. */
2046 stmt_info = vinfo_for_stmt (vect_dr_stmt (dr0));
2047 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2048 npeel /= DR_GROUP_SIZE (stmt_info);
2050 if (dump_enabled_p ())
2051 dump_printf_loc (MSG_NOTE, vect_location,
2052 "Try peeling by %d\n", npeel);
2055 /* Ensure that all datarefs can be vectorized after the peel. */
2056 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2057 do_peeling = false;
2059 /* Check if all datarefs are supportable and log. */
2060 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2062 stat = vect_verify_datarefs_alignment (loop_vinfo);
2063 if (!stat)
2064 do_peeling = false;
2065 else
2066 return stat;
2069 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2070 if (do_peeling)
2072 unsigned max_allowed_peel
2073 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2074 if (max_allowed_peel != (unsigned)-1)
2076 unsigned max_peel = npeel;
2077 if (max_peel == 0)
2079 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2080 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2082 if (max_peel > max_allowed_peel)
2084 do_peeling = false;
2085 if (dump_enabled_p ())
2086 dump_printf_loc (MSG_NOTE, vect_location,
2087 "Disable peeling, max peels reached: %d\n", max_peel);
2092 /* Cost model #2 - if peeling may result in a remaining loop not
2093 iterating enough to be vectorized then do not peel. Since this
2094 is a cost heuristic rather than a correctness decision, use the
2095 most likely runtime value for variable vectorization factors. */
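/* For instance (hypothetical numbers): with 10 known scalar iterations,
   an assumed vectorization factor of 8 and up to 7 peeled iterations,
   10 < 8 + 7, so the remaining loop might not even reach one full
   vector iteration and peeling is abandoned.  */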
2096 if (do_peeling
2097 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2099 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2100 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2101 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2102 < assumed_vf + max_peel)
2103 do_peeling = false;
2106 if (do_peeling)
2108 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2109 If the misalignment of DR_i is identical to that of dr0 then set
2110 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2111 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2112 by the peeling factor times the element size of DR_i (MOD the
2113 vectorization factor times the size). Otherwise, the
2114 misalignment of DR_i must be set to unknown. */
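/* A small worked example (illustrative values): peeling NPEEL = 3
   iterations of 4-byte elements to align DR0 shifts every other data
   reference by 12 bytes, so a reference that was misaligned by 8
   against a 16-byte boundary ends up with misalignment
   (8 + 3 * 4) mod 16 = 4, while references with unknown misalignment
   simply stay unknown.  */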
2115 FOR_EACH_VEC_ELT (datarefs, i, dr)
2116 if (dr != dr0)
2118 /* Strided accesses perform only component accesses, alignment
2119 is irrelevant for them. */
2120 stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
2121 if (STMT_VINFO_STRIDED_P (stmt_info)
2122 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2123 continue;
2125 vect_update_misalignment_for_peel (dr, dr0, npeel);
2128 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2129 if (npeel)
2130 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2131 else
2132 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2133 = DR_MISALIGNMENT (dr0);
2134 SET_DR_MISALIGNMENT (dr0, 0);
2135 if (dump_enabled_p ())
2137 dump_printf_loc (MSG_NOTE, vect_location,
2138 "Alignment of access forced using peeling.\n");
2139 dump_printf_loc (MSG_NOTE, vect_location,
2140 "Peeling for alignment will be applied.\n");
2143 /* The inside-loop cost will be accounted for in vectorizable_load
2144 and vectorizable_store correctly with adjusted alignments.
2145 Drop the body_cst_vec on the floor here. */
2146 stat = vect_verify_datarefs_alignment (loop_vinfo);
2147 gcc_assert (stat);
2148 return stat;
2152 /* (2) Versioning to force alignment. */
2154 /* Try versioning if:
2155 1) optimize loop for speed
2156 2) there is at least one unsupported misaligned data ref with an unknown
2157 misalignment, and
2158 3) all misaligned data refs with a known misalignment are supported, and
2159 4) the number of runtime alignment checks is within reason. */
2161 do_versioning =
2162 optimize_loop_nest_for_speed_p (loop)
2163 && (!loop->inner); /* FORNOW */
2165 if (do_versioning)
2167 FOR_EACH_VEC_ELT (datarefs, i, dr)
2169 stmt = vect_dr_stmt (dr);
2170 stmt_info = vinfo_for_stmt (stmt);
2172 /* For interleaving, only the alignment of the first access
2173 matters. */
2174 if (aligned_access_p (dr)
2175 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2176 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2177 continue;
2179 if (STMT_VINFO_STRIDED_P (stmt_info))
2181 /* Strided loads perform only component accesses, alignment is
2182 irrelevant for them. */
2183 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2184 continue;
2185 do_versioning = false;
2186 break;
2189 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2191 if (!supportable_dr_alignment)
2193 gimple *stmt;
2194 int mask;
2195 tree vectype;
2197 if (known_alignment_for_access_p (dr)
2198 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2199 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2201 do_versioning = false;
2202 break;
2205 stmt = vect_dr_stmt (dr);
2206 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2207 gcc_assert (vectype);
2209 /* At present we don't support versioning for alignment
2210 with variable VF, since there's no guarantee that the
2211 VF is a power of two. We could relax this if we added
2212 a way of enforcing a power-of-two size. */
2213 unsigned HOST_WIDE_INT size;
2214 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2216 do_versioning = false;
2217 break;
2220 /* The rightmost bits of an aligned address must be zeros.
2221 Construct the mask needed for this test. For example,
2222 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2223 mask must be 15 = 0xf. */
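/* The run-time test that versioning later builds from this mask is
   roughly of the form (illustrative only, with p and q standing for
   the potentially misaligned addresses):

     if ((((uintptr_t) p | (uintptr_t) q) & mask) == 0)
       ...   # vectorized loop, all accesses known to be aligned
     else
       ...   # scalar fallback loop

   i.e. the recorded addresses are OR-ed together and tested against
   the common mask.  */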
2224 mask = size - 1;
2226 /* FORNOW: use the same mask to test all potentially unaligned
2227 references in the loop. The vectorizer currently supports
2228 a single vector size, see the reference to
2229 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2230 vectorization factor is computed. */
2231 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2232 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2233 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2234 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2235 vect_dr_stmt (dr));
2239 /* Versioning requires at least one misaligned data reference. */
2240 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2241 do_versioning = false;
2242 else if (!do_versioning)
2243 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2246 if (do_versioning)
2248 vec<gimple *> may_misalign_stmts
2249 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2250 gimple *stmt;
2252 /* It can now be assumed that the data references in the statements
2253 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2254 of the loop being vectorized. */
2255 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2257 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2258 dr = STMT_VINFO_DATA_REF (stmt_info);
2259 SET_DR_MISALIGNMENT (dr, 0);
2260 if (dump_enabled_p ())
2261 dump_printf_loc (MSG_NOTE, vect_location,
2262 "Alignment of access forced using versioning.\n");
2265 if (dump_enabled_p ())
2266 dump_printf_loc (MSG_NOTE, vect_location,
2267 "Versioning for alignment will be applied.\n");
2269 /* Peeling and versioning can't be done together at this time. */
2270 gcc_assert (! (do_peeling && do_versioning));
2272 stat = vect_verify_datarefs_alignment (loop_vinfo);
2273 gcc_assert (stat);
2274 return stat;
2277 /* This point is reached if neither peeling nor versioning is being done. */
2278 gcc_assert (! (do_peeling || do_versioning));
2280 stat = vect_verify_datarefs_alignment (loop_vinfo);
2281 return stat;
2285 /* Function vect_find_same_alignment_drs.
2287 Update group and alignment relations according to the chosen
2288 vectorization factor. */
2290 static void
2291 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2293 struct data_reference *dra = DDR_A (ddr);
2294 struct data_reference *drb = DDR_B (ddr);
2295 stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
2296 stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
2298 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2299 return;
2301 if (dra == drb)
2302 return;
2304 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2305 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2306 return;
2308 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2309 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2310 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2311 return;
2313 /* Two references with distance zero have the same alignment. */
2314 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2315 - wi::to_poly_offset (DR_INIT (drb)));
2316 if (maybe_ne (diff, 0))
2318 /* Get the wider of the two alignments. */
2319 unsigned int align_a = (vect_calculate_target_alignment (dra)
2320 / BITS_PER_UNIT);
2321 unsigned int align_b = (vect_calculate_target_alignment (drb)
2322 / BITS_PER_UNIT);
2323 unsigned int max_align = MAX (align_a, align_b);
2325 /* Require the gap to be a multiple of the larger vector alignment. */
2326 if (!multiple_p (diff, max_align))
2327 return;
2330 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2331 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2332 if (dump_enabled_p ())
2334 dump_printf_loc (MSG_NOTE, vect_location,
2335 "accesses have the same alignment: ");
2336 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2337 dump_printf (MSG_NOTE, " and ");
2338 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2339 dump_printf (MSG_NOTE, "\n");
2344 /* Function vect_analyze_data_refs_alignment
2346 Analyze the alignment of the data-references in the loop.
2347 Return FALSE if a data reference is found that cannot be vectorized. */
2349 bool
2350 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2352 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2354 /* Mark groups of data references with same alignment using
2355 data dependence information. */
2356 vec<ddr_p> ddrs = vinfo->ddrs;
2357 struct data_dependence_relation *ddr;
2358 unsigned int i;
2360 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2361 vect_find_same_alignment_drs (ddr);
2363 vec<data_reference_p> datarefs = vinfo->datarefs;
2364 struct data_reference *dr;
2366 vect_record_base_alignments (vinfo);
2367 FOR_EACH_VEC_ELT (datarefs, i, dr)
2369 stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
2370 if (STMT_VINFO_VECTORIZABLE (stmt_info))
2371 vect_compute_data_ref_alignment (dr);
2374 return true;
2378 /* Analyze alignment of DRs of stmts in NODE. */
2380 static bool
2381 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2383 /* We vectorize from the first scalar stmt in the node unless
2384 the node is permuted in which case we start from the first
2385 element in the group. */
2386 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2387 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2388 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2389 first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2391 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2392 vect_compute_data_ref_alignment (dr);
2393 /* For creating the data-ref pointer we need alignment of the
2394 first element anyway. */
2395 if (dr != first_dr)
2396 vect_compute_data_ref_alignment (first_dr);
2397 if (! verify_data_ref_alignment (dr))
2399 if (dump_enabled_p ())
2400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2401 "not vectorized: bad data alignment in basic "
2402 "block.\n");
2403 return false;
2406 return true;
2409 /* Function vect_slp_analyze_instance_alignment
2411 Analyze the alignment of the data-references in the SLP instance.
2412 Return FALSE if a data reference is found that cannot be vectorized. */
2414 bool
2415 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2417 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2419 slp_tree node;
2420 unsigned i;
2421 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2422 if (! vect_slp_analyze_and_verify_node_alignment (node))
2423 return false;
2425 node = SLP_INSTANCE_TREE (instance);
2426 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2427 && ! vect_slp_analyze_and_verify_node_alignment
2428 (SLP_INSTANCE_TREE (instance)))
2429 return false;
2431 return true;
2435 /* Analyze groups of accesses: check that DR belongs to a group of
2436 accesses of legal size, step, etc. Detect gaps, single element
2437 interleaving, and other special cases. Set grouped access info.
2438 Collect groups of strided stores for further use in SLP analysis.
2439 Worker for vect_analyze_group_access. */
2441 static bool
2442 vect_analyze_group_access_1 (struct data_reference *dr)
2444 tree step = DR_STEP (dr);
2445 tree scalar_type = TREE_TYPE (DR_REF (dr));
2446 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2447 gimple *stmt = vect_dr_stmt (dr);
2448 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2449 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2450 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2451 HOST_WIDE_INT dr_step = -1;
2452 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2453 bool slp_impossible = false;
2455 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2456 size of the interleaving group (including gaps). */
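/* For example (illustrative only): in

     for (i = 0; i < n; i++)
       sum += a[3*i] + a[3*i + 1];   # 4-byte elements, DR_STEP = 12

   the two loads form an interleaving group with GROUPSIZE 3 and a gap
   of one element per iteration, since a[3*i + 2] is never read.  */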
2457 if (tree_fits_shwi_p (step))
2459 dr_step = tree_to_shwi (step);
2460 /* Check that STEP is a multiple of type size. Otherwise there is
2461 a non-element-sized gap at the end of the group which we
2462 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2463 ??? As we can handle non-constant step fine here we should
2464 simply remove uses of DR_GROUP_GAP between the last and first
2465 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2466 simply not include that gap. */
2467 if ((dr_step % type_size) != 0)
2469 if (dump_enabled_p ())
2471 dump_printf_loc (MSG_NOTE, vect_location,
2472 "Step ");
2473 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2474 dump_printf (MSG_NOTE,
2475 " is not a multiple of the element size for ");
2476 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2477 dump_printf (MSG_NOTE, "\n");
2479 return false;
2481 groupsize = absu_hwi (dr_step) / type_size;
2483 else
2484 groupsize = 0;
2486 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2487 if (!DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2489 /* Check if this DR is part of an interleaving group, and is a single
2490 element of the group that is accessed in the loop. */
2492 /* Gaps are supported only for loads. STEP must be a multiple of the type
2493 size. */
2494 if (DR_IS_READ (dr)
2495 && (dr_step % type_size) == 0
2496 && groupsize > 0)
2498 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2499 DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2500 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2501 if (dump_enabled_p ())
2503 dump_printf_loc (MSG_NOTE, vect_location,
2504 "Detected single element interleaving ");
2505 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2506 dump_printf (MSG_NOTE, " step ");
2507 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2508 dump_printf (MSG_NOTE, "\n");
2511 return true;
2514 if (dump_enabled_p ())
2516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2517 "not consecutive access ");
2518 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2521 if (bb_vinfo)
2523 /* Mark the statement as unvectorizable. */
2524 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr))) = false;
2525 return true;
2528 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2529 STMT_VINFO_STRIDED_P (stmt_info) = true;
2530 return true;
2533 if (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2535 /* First stmt in the interleaving chain. Check the chain. */
2536 gimple *next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2537 struct data_reference *data_ref = dr;
2538 unsigned int count = 1;
2539 tree prev_init = DR_INIT (data_ref);
2540 gimple *prev = stmt;
2541 HOST_WIDE_INT diff, gaps = 0;
2543 /* By construction, all group members have INTEGER_CST DR_INITs. */
2544 while (next)
2546 /* Skip same data-refs. In case two or more stmts share a
2547 data-ref (supported only for loads), we vectorize only the first
2548 stmt, and the rest get their vectorized loads from the first
2549 one. */
2550 if (!tree_int_cst_compare (DR_INIT (data_ref),
2551 DR_INIT (STMT_VINFO_DATA_REF (
2552 vinfo_for_stmt (next)))))
2554 if (DR_IS_WRITE (data_ref))
2556 if (dump_enabled_p ())
2557 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2558 "Two store stmts share the same dr.\n");
2559 return false;
2562 if (dump_enabled_p ())
2563 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2564 "Two or more load stmts share the same dr.\n");
2566 /* For load use the same data-ref load. */
2567 DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2569 prev = next;
2570 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2571 continue;
2574 prev = next;
2575 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2577 /* All group members have the same STEP by construction. */
2578 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2580 /* Check that the distance between two accesses is equal to the type
2581 size. Otherwise, we have gaps. */
2582 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2583 - TREE_INT_CST_LOW (prev_init)) / type_size;
2584 if (diff != 1)
2586 /* FORNOW: SLP of accesses with gaps is not supported. */
2587 slp_impossible = true;
2588 if (DR_IS_WRITE (data_ref))
2590 if (dump_enabled_p ())
2591 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2592 "interleaved store with gaps\n");
2593 return false;
2596 gaps += diff - 1;
2599 last_accessed_element += diff;
2601 /* Store the gap from the previous member of the group. If there is no
2602 gap in the access, DR_GROUP_GAP is always 1. */
2603 DR_GROUP_GAP (vinfo_for_stmt (next)) = diff;
2605 prev_init = DR_INIT (data_ref);
2606 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2607 /* Count the number of data-refs in the chain. */
2608 count++;
2611 if (groupsize == 0)
2612 groupsize = count + gaps;
2614 /* This could be UINT_MAX but as we are generating code in a very
2615 inefficient way we have to cap earlier. See PR78699 for example. */
2616 if (groupsize > 4096)
2618 if (dump_enabled_p ())
2619 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2620 "group is too large\n");
2621 return false;
2624 /* Check that the size of the interleaving is equal to count for stores,
2625 i.e., that there are no gaps. */
2626 if (groupsize != count
2627 && !DR_IS_READ (dr))
2629 if (dump_enabled_p ())
2630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2631 "interleaved store with gaps\n");
2632 return false;
2635 /* If there is a gap after the last load in the group it is the
2636 difference between the groupsize and the last accessed
2637 element.
2638 When there is no gap, this difference should be 0. */
2639 DR_GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2641 DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2642 if (dump_enabled_p ())
2644 dump_printf_loc (MSG_NOTE, vect_location,
2645 "Detected interleaving ");
2646 if (DR_IS_READ (dr))
2647 dump_printf (MSG_NOTE, "load ");
2648 else
2649 dump_printf (MSG_NOTE, "store ");
2650 dump_printf (MSG_NOTE, "of size %u starting with ",
2651 (unsigned)groupsize);
2652 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2653 if (DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2654 dump_printf_loc (MSG_NOTE, vect_location,
2655 "There is a gap of %u elements after the group\n",
2656 DR_GROUP_GAP (vinfo_for_stmt (stmt)));
2659 /* SLP: create an SLP data structure for every interleaving group of
2660 stores for further analysis in vect_analyze_slp. */
2661 if (DR_IS_WRITE (dr) && !slp_impossible)
2663 if (loop_vinfo)
2664 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2665 if (bb_vinfo)
2666 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2670 return true;
2673 /* Analyze groups of accesses: check that DR belongs to a group of
2674 accesses of legal size, step, etc. Detect gaps, single element
2675 interleaving, and other special cases. Set grouped access info.
2676 Collect groups of strided stores for further use in SLP analysis. */
2678 static bool
2679 vect_analyze_group_access (struct data_reference *dr)
2681 if (!vect_analyze_group_access_1 (dr))
2683 /* Dissolve the group if present. */
2684 gimple *next;
2685 gimple *stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (dr)));
2686 while (stmt)
2688 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2689 next = DR_GROUP_NEXT_ELEMENT (vinfo);
2690 DR_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2691 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2692 stmt = next;
2694 return false;
2696 return true;
2699 /* Analyze the access pattern of the data-reference DR.
2700 In case of non-consecutive accesses call vect_analyze_group_access() to
2701 analyze groups of accesses. */
2703 static bool
2704 vect_analyze_data_ref_access (struct data_reference *dr)
2706 tree step = DR_STEP (dr);
2707 tree scalar_type = TREE_TYPE (DR_REF (dr));
2708 gimple *stmt = vect_dr_stmt (dr);
2709 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2710 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2711 struct loop *loop = NULL;
2713 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2714 return true;
2716 if (loop_vinfo)
2717 loop = LOOP_VINFO_LOOP (loop_vinfo);
2719 if (loop_vinfo && !step)
2721 if (dump_enabled_p ())
2722 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2723 "bad data-ref access in loop\n");
2724 return false;
2727 /* Allow loads with zero step in inner-loop vectorization. */
2728 if (loop_vinfo && integer_zerop (step))
2730 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2731 if (!nested_in_vect_loop_p (loop, stmt))
2732 return DR_IS_READ (dr);
2733 /* Allow references with zero step for outer loops marked
2734 with pragma omp simd only - it guarantees absence of
2735 loop-carried dependencies between inner loop iterations. */
2736 if (loop->safelen < 2)
2738 if (dump_enabled_p ())
2739 dump_printf_loc (MSG_NOTE, vect_location,
2740 "zero step in inner loop of nest\n");
2741 return false;
2745 if (loop && nested_in_vect_loop_p (loop, stmt))
2747 /* Interleaved accesses are not yet supported within outer-loop
2748 vectorization for references in the inner-loop. */
2749 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2751 /* For the rest of the analysis we use the outer-loop step. */
2752 step = STMT_VINFO_DR_STEP (stmt_info);
2753 if (integer_zerop (step))
2755 if (dump_enabled_p ())
2756 dump_printf_loc (MSG_NOTE, vect_location,
2757 "zero step in outer loop.\n");
2758 return DR_IS_READ (dr);
2762 /* Consecutive? */
2763 if (TREE_CODE (step) == INTEGER_CST)
2765 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2766 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2767 || (dr_step < 0
2768 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2770 /* Mark that it is not interleaving. */
2771 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2772 return true;
2776 if (loop && nested_in_vect_loop_p (loop, stmt))
2778 if (dump_enabled_p ())
2779 dump_printf_loc (MSG_NOTE, vect_location,
2780 "grouped access in outer loop.\n");
2781 return false;
2785 /* Assume this is a DR handled by non-constant strided load case. */
2786 if (TREE_CODE (step) != INTEGER_CST)
2787 return (STMT_VINFO_STRIDED_P (stmt_info)
2788 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2789 || vect_analyze_group_access (dr)));
2791 /* Not a consecutive access - check if it is part of an interleaving group. */
2792 return vect_analyze_group_access (dr);
2795 /* Compare two data-references DRA and DRB, ordering them into chunks
2796 suitable for grouping. */
2798 static int
2799 dr_group_sort_cmp (const void *dra_, const void *drb_)
2801 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2802 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2803 int cmp;
2805 /* Stabilize sort. */
2806 if (dra == drb)
2807 return 0;
2809 /* DRs in different loops never belong to the same group. */
2810 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2811 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2812 if (loopa != loopb)
2813 return loopa->num < loopb->num ? -1 : 1;
2815 /* Ordering of DRs according to base. */
2816 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2817 DR_BASE_ADDRESS (drb));
2818 if (cmp != 0)
2819 return cmp;
2821 /* And according to DR_OFFSET. */
2822 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2823 if (cmp != 0)
2824 return cmp;
2826 /* Put reads before writes. */
2827 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2828 return DR_IS_READ (dra) ? -1 : 1;
2830 /* Then sort by access size. */
2831 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2832 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2833 if (cmp != 0)
2834 return cmp;
2836 /* And by step. */
2837 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2838 if (cmp != 0)
2839 return cmp;
2841 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2842 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2843 if (cmp == 0)
2844 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2845 return cmp;
2848 /* If OP is the result of a conversion, return the unconverted value,
2849 otherwise return null. */
2851 static tree
2852 strip_conversion (tree op)
2854 if (TREE_CODE (op) != SSA_NAME)
2855 return NULL_TREE;
2856 gimple *stmt = SSA_NAME_DEF_STMT (op);
2857 if (!is_gimple_assign (stmt)
2858 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2859 return NULL_TREE;
2860 return gimple_assign_rhs1 (stmt);
2863 /* Return true if vectorizable_* routines can handle statements STMT1
2864 and STMT2 being in a single group. */
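/* For instance (illustrative gimple): the two masked loads

     _3 = .MASK_LOAD (p_1, 32B, mask_2);
     _5 = .MASK_LOAD (q_4, 32B, mask_7);

   can be grouped only if mask_2 and mask_7 are the same value,
   possibly hidden behind conversions such as those introduced by
   build_mask_conversion; two ordinary single assignments can always
   be grouped with each other.  */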
2866 static bool
2867 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2869 if (gimple_assign_single_p (stmt1))
2870 return gimple_assign_single_p (stmt2);
2872 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2874 /* Check for two masked loads or two masked stores. */
2875 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2876 return false;
2877 internal_fn ifn = gimple_call_internal_fn (stmt1);
2878 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2879 return false;
2880 if (ifn != gimple_call_internal_fn (stmt2))
2881 return false;
2883 /* Check that the masks are the same. Cope with casts of masks,
2884 like those created by build_mask_conversion. */
2885 tree mask1 = gimple_call_arg (stmt1, 2);
2886 tree mask2 = gimple_call_arg (stmt2, 2);
2887 if (!operand_equal_p (mask1, mask2, 0))
2889 mask1 = strip_conversion (mask1);
2890 if (!mask1)
2891 return false;
2892 mask2 = strip_conversion (mask2);
2893 if (!mask2)
2894 return false;
2895 if (!operand_equal_p (mask1, mask2, 0))
2896 return false;
2898 return true;
2901 return false;
2904 /* Function vect_analyze_data_ref_accesses.
2906 Analyze the access pattern of all the data references in the loop.
2908 FORNOW: the only access pattern that is considered vectorizable is a
2909 simple step 1 (consecutive) access.
2911 FORNOW: handle only arrays and pointer accesses. */
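/* As an illustration (not tied to a particular testcase): for

     for (i = 0; i < n; i++)
       {
         out[2*i]     = in[2*i + 1];
         out[2*i + 1] = in[2*i];
       }

   the two loads and the two stores are each linked into an
   interleaving chain of size 2 below, based on their common base,
   offset and step and their differing DR_INITs.  */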
2913 bool
2914 vect_analyze_data_ref_accesses (vec_info *vinfo)
2916 unsigned int i;
2917 vec<data_reference_p> datarefs = vinfo->datarefs;
2918 struct data_reference *dr;
2920 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2922 if (datarefs.is_empty ())
2923 return true;
2925 /* Sort the array of datarefs to make building the interleaving chains
2926 linear. Don't modify the original vector's order, it is needed for
2927 determining what dependencies are reversed. */
2928 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2929 datarefs_copy.qsort (dr_group_sort_cmp);
2931 /* Build the interleaving chains. */
2932 for (i = 0; i < datarefs_copy.length () - 1;)
2934 data_reference_p dra = datarefs_copy[i];
2935 stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
2936 stmt_vec_info lastinfo = NULL;
2937 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2938 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2940 ++i;
2941 continue;
2943 for (i = i + 1; i < datarefs_copy.length (); ++i)
2945 data_reference_p drb = datarefs_copy[i];
2946 stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
2947 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2948 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2949 break;
2951 /* ??? Imperfect sorting (non-compatible types, non-modulo
2952 accesses, same accesses) can lead to a group being artificially
2953 split here as we don't just skip over those. If it really
2954 matters we can push those to a worklist and re-iterate
2955 over them. Then we can just skip ahead to the next DR here. */
2957 /* DRs in a different loop should not be put into the same
2958 interleaving group. */
2959 if (gimple_bb (DR_STMT (dra))->loop_father
2960 != gimple_bb (DR_STMT (drb))->loop_father)
2961 break;
2963 /* Check that the data-refs have the same first location (except init)
2964 and that they are both either stores or loads (not a load and a store,
2965 and not masked accesses with incompatible masks). */
2966 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2967 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2968 DR_BASE_ADDRESS (drb)) != 0
2969 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2970 || !can_group_stmts_p (vect_dr_stmt (dra), vect_dr_stmt (drb)))
2971 break;
2973 /* Check that the data-refs have the same constant size. */
2974 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2975 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2976 if (!tree_fits_uhwi_p (sza)
2977 || !tree_fits_uhwi_p (szb)
2978 || !tree_int_cst_equal (sza, szb))
2979 break;
2981 /* Check that the data-refs have the same step. */
2982 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2983 break;
2985 /* Check the types are compatible.
2986 ??? We don't distinguish this during sorting. */
2987 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2988 TREE_TYPE (DR_REF (drb))))
2989 break;
2991 /* Check that the DR_INITs are compile-time constants. */
2992 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2993 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2994 break;
2996 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2997 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2998 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2999 HOST_WIDE_INT init_prev
3000 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3001 gcc_assert (init_a <= init_b
3002 && init_a <= init_prev
3003 && init_prev <= init_b);
3005 /* Do not place the same access in the interleaving chain twice. */
3006 if (init_b == init_prev)
3008 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3009 < gimple_uid (DR_STMT (drb)));
3010 /* ??? For now we simply "drop" the later reference which is
3011 otherwise the same rather than finishing off this group.
3012 In the end we'd want to re-process duplicates forming
3013 multiple groups from the refs, likely by just collecting
3014 all candidates (including duplicates and split points
3015 below) in a vector and then process them together. */
3016 continue;
3019 /* If init_b == init_a + the size of the type * k, we have an
3020 interleaving, and DRA is accessed before DRB. */
3021 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3022 if (type_size_a == 0
3023 || (init_b - init_a) % type_size_a != 0)
3024 break;
3026 /* If we have a store, the accesses are adjacent. This splits
3027 groups into chunks we support (we don't support vectorization
3028 of stores with gaps). */
3029 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3030 break;
3032 /* If the step (if not zero or non-constant) is greater than the
3033 difference between data-refs' inits this splits groups into
3034 suitable sizes. */
3035 if (tree_fits_shwi_p (DR_STEP (dra)))
3037 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3038 if (step != 0 && step <= (init_b - init_a))
3039 break;
3042 if (dump_enabled_p ())
3044 dump_printf_loc (MSG_NOTE, vect_location,
3045 "Detected interleaving ");
3046 if (DR_IS_READ (dra))
3047 dump_printf (MSG_NOTE, "load ");
3048 else
3049 dump_printf (MSG_NOTE, "store ");
3050 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3051 dump_printf (MSG_NOTE, " and ");
3052 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3053 dump_printf (MSG_NOTE, "\n");
3056 /* Link the found element into the group list. */
3057 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3059 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = vect_dr_stmt (dra);
3060 lastinfo = stmtinfo_a;
3062 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = vect_dr_stmt (dra);
3063 DR_GROUP_NEXT_ELEMENT (lastinfo) = vect_dr_stmt (drb);
3064 lastinfo = stmtinfo_b;
3068 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3069 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr)))
3070 && !vect_analyze_data_ref_access (dr))
3072 if (dump_enabled_p ())
3073 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3074 "not vectorized: complicated access pattern.\n");
3076 if (is_a <bb_vec_info> (vinfo))
3078 /* Mark the statement as not vectorizable. */
3079 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr))) = false;
3080 continue;
3082 else
3084 datarefs_copy.release ();
3085 return false;
3089 datarefs_copy.release ();
3090 return true;
3093 /* Function vect_vfa_segment_size.
3095 Input:
3096 DR: The data reference.
3097 LENGTH_FACTOR: segment length to consider.
3099 Return a value suitable for the dr_with_seg_len::seg_len field.
3100 This is the "distance travelled" by the pointer from the first
3101 iteration in the segment to the last. Note that it does not include
3102 the size of the access; in effect it only describes the first byte. */
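/* For example (hypothetical values): with DR_STEP = 4 and
   LENGTH_FACTOR = 8 iterations, the segment length is
   4 * (8 - 1) = 28 bytes: the distance the pointer moves between the
   first and the last iteration of the segment, with the access size
   accounted for separately.  */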
3104 static tree
3105 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3107 length_factor = size_binop (MINUS_EXPR,
3108 fold_convert (sizetype, length_factor),
3109 size_one_node);
3110 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3111 length_factor);
3114 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3115 gives the worst-case number of bytes covered by the segment. */
3117 static unsigned HOST_WIDE_INT
3118 vect_vfa_access_size (data_reference *dr)
3120 stmt_vec_info stmt_vinfo = vinfo_for_stmt (vect_dr_stmt (dr));
3121 tree ref_type = TREE_TYPE (DR_REF (dr));
3122 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3123 unsigned HOST_WIDE_INT access_size = ref_size;
3124 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3126 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == vect_dr_stmt (dr));
3127 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3129 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3130 && (vect_supportable_dr_alignment (dr, false)
3131 == dr_explicit_realign_optimized))
3133 /* We might access a full vector's worth. */
3134 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3135 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3137 return access_size;
3140 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3142 static unsigned int
3143 vect_vfa_align (const data_reference *dr)
3145 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3148 /* Function vect_no_alias_p.
3150 Given data references A and B with equal base and offset, see whether
3151 the alias relation can be decided at compilation time. Return 1 if
3152 it can and the references alias, 0 if it can and the references do
3153 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3154 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3155 of dr_with_seg_len::{seg_len,access_size} for A and B. */
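/* For instance (illustrative numbers only): two references with
   DR_INITs 0 and 32, positive steps, segment lengths of 28 bytes and
   access sizes of 4 bytes cover the byte ranges [0, 32) and [32, 64),
   which provably never overlap, so the function returns 0 and the
   run-time check can be dropped.  */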
3157 static int
3158 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3159 tree segment_length_a, tree segment_length_b,
3160 unsigned HOST_WIDE_INT access_size_a,
3161 unsigned HOST_WIDE_INT access_size_b)
3163 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3164 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3165 poly_uint64 const_length_a;
3166 poly_uint64 const_length_b;
3168 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3169 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3170 [a, a+12) */
3171 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3173 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3174 offset_a = (offset_a + access_size_a) - const_length_a;
3176 else
3177 const_length_a = tree_to_poly_uint64 (segment_length_a);
3178 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3180 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3181 offset_b = (offset_b + access_size_b) - const_length_b;
3183 else
3184 const_length_b = tree_to_poly_uint64 (segment_length_b);
3186 const_length_a += access_size_a;
3187 const_length_b += access_size_b;
3189 if (ranges_known_overlap_p (offset_a, const_length_a,
3190 offset_b, const_length_b))
3191 return 1;
3193 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3194 offset_b, const_length_b))
3195 return 0;
3197 return -1;
3200 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3201 in DDR is >= VF. */
3203 static bool
3204 dependence_distance_ge_vf (data_dependence_relation *ddr,
3205 unsigned int loop_depth, poly_uint64 vf)
3207 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3208 || DDR_NUM_DIST_VECTS (ddr) == 0)
3209 return false;
3211 /* If the dependence is exact, we should have limited the VF instead. */
3212 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3214 unsigned int i;
3215 lambda_vector dist_v;
3216 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3218 HOST_WIDE_INT dist = dist_v[loop_depth];
3219 if (dist != 0
3220 && !(dist > 0 && DDR_REVERSED_P (ddr))
3221 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3222 return false;
3225 if (dump_enabled_p ())
3227 dump_printf_loc (MSG_NOTE, vect_location,
3228 "dependence distance between ");
3229 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3230 dump_printf (MSG_NOTE, " and ");
3231 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3232 dump_printf (MSG_NOTE, " is >= VF\n");
3235 return true;
3238 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3240 static void
3241 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3243 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3244 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3245 dump_printf (dump_kind, ") >= ");
3246 dump_dec (dump_kind, lower_bound.min_value);
3249 /* Record that the vectorized loop requires the vec_lower_bound described
3250 by EXPR, UNSIGNED_P and MIN_VALUE. */
3252 static void
3253 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3254 poly_uint64 min_value)
3256 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3257 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3258 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3260 unsigned_p &= lower_bounds[i].unsigned_p;
3261 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3262 if (lower_bounds[i].unsigned_p != unsigned_p
3263 || maybe_lt (lower_bounds[i].min_value, min_value))
3265 lower_bounds[i].unsigned_p = unsigned_p;
3266 lower_bounds[i].min_value = min_value;
3267 if (dump_enabled_p ())
3269 dump_printf_loc (MSG_NOTE, vect_location,
3270 "updating run-time check to ");
3271 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3272 dump_printf (MSG_NOTE, "\n");
3275 return;
3278 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3279 if (dump_enabled_p ())
3281 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3282 dump_lower_bound (MSG_NOTE, lower_bound);
3283 dump_printf (MSG_NOTE, "\n");
3285 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3288 /* Return true if it's unlikely that the step of the vectorized form of DR
3289 will span fewer than GAP bytes. */
3291 static bool
3292 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3294 stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
3295 HOST_WIDE_INT count
3296 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3297 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3298 count *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
3299 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3302 /* Return true if we know that there is no alias between DR_A and DR_B
3303 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3304 *LOWER_BOUND_OUT to this N. */
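/* For example (hypothetical): for accesses p[i] and p[i + 2] with
   4-byte elements and the same variable step, the gap between the two
   DR_INITs is 8 bytes, so together the accesses span 8 + 4 = 12 bytes
   and cannot alias across iterations once abs (DR_STEP) >= 12; the
   caller can then emit a cheap run-time check on the step instead of
   a full overlap test.  */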
3306 static bool
3307 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3308 poly_uint64 *lower_bound_out)
3310 /* Check that there is a constant gap of known sign between DR_A
3311 and DR_B. */
3312 poly_int64 init_a, init_b;
3313 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3314 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3315 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3316 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3317 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3318 || !ordered_p (init_a, init_b))
3319 return false;
3321 /* Sort DR_A and DR_B by the address they access. */
3322 if (maybe_lt (init_b, init_a))
3324 std::swap (init_a, init_b);
3325 std::swap (dr_a, dr_b);
3328 /* If the two accesses could be dependent within a scalar iteration,
3329 make sure that we'd retain their order. */
3330 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3331 && !vect_preserves_scalar_order_p (vect_dr_stmt (dr_a),
3332 vect_dr_stmt (dr_b)))
3333 return false;
3335 /* There is no alias if abs (DR_STEP) is greater than or equal to
3336 the bytes spanned by the combination of the two accesses. */
3337 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3338 return true;
3341 /* Function vect_prune_runtime_alias_test_list.
3343 Prune a list of ddrs to be tested at run-time by versioning for alias.
3344 Merge several alias checks into one if possible.
3345 Return FALSE if the resulting list of ddrs is longer than allowed by
3346 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
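/* Conceptually (simplified; the real condition is built later from the
   dr_with_seg_len pairs collected here), each surviving pair ends up
   being checked at run time with a test of the form

     if (a + segment_length_a <= b || b + segment_length_b <= a)
       ...   # segments cannot overlap: run the vectorized loop
     else
       ...   # fall back to the scalar loop

   so this function first tries to prove pairs independent at compile
   time or to replace them with cheaper lower-bound checks on the
   step.  */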
3348 bool
3349 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3351 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3352 hash_set <tree_pair_hash> compared_objects;
3354 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3355 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3356 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3357 vec<vec_object_pair> &check_unequal_addrs
3358 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3359 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3360 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3362 ddr_p ddr;
3363 unsigned int i;
3364 tree length_factor;
3366 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3368 /* Step values are irrelevant for aliasing if the number of vector
3369 iterations is equal to the number of scalar iterations (which can
3370 happen for fully-SLP loops). */
3371 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3373 if (!ignore_step_p)
3375 /* Convert the checks for nonzero steps into bound tests. */
3376 tree value;
3377 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3378 vect_check_lower_bound (loop_vinfo, value, true, 1);
3381 if (may_alias_ddrs.is_empty ())
3382 return true;
3384 comp_alias_ddrs.create (may_alias_ddrs.length ());
3386 unsigned int loop_depth
3387 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3388 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3390 /* First, we collect all data ref pairs for aliasing checks. */
3391 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3393 int comp_res;
3394 poly_uint64 lower_bound;
3395 struct data_reference *dr_a, *dr_b;
3396 gimple *dr_group_first_a, *dr_group_first_b;
3397 tree segment_length_a, segment_length_b;
3398 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3399 unsigned int align_a, align_b;
3400 gimple *stmt_a, *stmt_b;
3402 /* Ignore the alias if the VF we chose ended up being no greater
3403 than the dependence distance. */
3404 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3405 continue;
3407 if (DDR_OBJECT_A (ddr))
3409 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3410 if (!compared_objects.add (new_pair))
3412 if (dump_enabled_p ())
3414 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3415 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3416 dump_printf (MSG_NOTE, " and ");
3417 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3418 dump_printf (MSG_NOTE, " have different addresses\n");
3420 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3422 continue;
3425 dr_a = DDR_A (ddr);
3426 stmt_a = vect_dr_stmt (DDR_A (ddr));
3428 dr_b = DDR_B (ddr);
3429 stmt_b = vect_dr_stmt (DDR_B (ddr));
3431 /* Skip the pair if inter-iteration dependencies are irrelevant
3432 and intra-iteration dependencies are guaranteed to be honored. */
3433 if (ignore_step_p
3434 && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3435 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3437 if (dump_enabled_p ())
3439 dump_printf_loc (MSG_NOTE, vect_location,
3440 "no need for alias check between ");
3441 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3442 dump_printf (MSG_NOTE, " and ");
3443 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3444 dump_printf (MSG_NOTE, " when VF is 1\n");
3446 continue;
3449 /* See whether we can handle the alias using a bounds check on
3450 the step, and whether that's likely to be the best approach.
3451 (It might not be, for example, if the minimum step is much larger
3452 than the number of bytes handled by one vector iteration.) */
3453 if (!ignore_step_p
3454 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3455 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3456 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3457 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3459 bool unsigned_p = dr_known_forward_stride_p (dr_a);
3460 if (dump_enabled_p ())
3462 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3463 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3464 dump_printf (MSG_NOTE, " and ");
3465 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3466 dump_printf (MSG_NOTE, " when the step ");
3467 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3468 dump_printf (MSG_NOTE, " is outside ");
3469 if (unsigned_p)
3470 dump_printf (MSG_NOTE, "[0");
3471 else
3473 dump_printf (MSG_NOTE, "(");
3474 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3476 dump_printf (MSG_NOTE, ", ");
3477 dump_dec (MSG_NOTE, lower_bound);
3478 dump_printf (MSG_NOTE, ")\n");
3480 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3481 lower_bound);
3482 continue;
3485 dr_group_first_a = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3486 if (dr_group_first_a)
3488 stmt_a = dr_group_first_a;
3489 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3492 dr_group_first_b = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3493 if (dr_group_first_b)
3495 stmt_b = dr_group_first_b;
3496 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3499 if (ignore_step_p)
3501 segment_length_a = size_zero_node;
3502 segment_length_b = size_zero_node;
3504 else
3506 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3507 length_factor = scalar_loop_iters;
3508 else
3509 length_factor = size_int (vect_factor);
3510 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3511 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3513 access_size_a = vect_vfa_access_size (dr_a);
3514 access_size_b = vect_vfa_access_size (dr_b);
3515 align_a = vect_vfa_align (dr_a);
3516 align_b = vect_vfa_align (dr_b);
3518 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3519 DR_BASE_ADDRESS (dr_b));
3520 if (comp_res == 0)
3521 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3522 DR_OFFSET (dr_b));
3524 /* See whether the alias is known at compilation time. */
3525 if (comp_res == 0
3526 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3527 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3528 && poly_int_tree_p (segment_length_a)
3529 && poly_int_tree_p (segment_length_b))
3531 int res = vect_compile_time_alias (dr_a, dr_b,
3532 segment_length_a,
3533 segment_length_b,
3534 access_size_a,
3535 access_size_b);
3536 if (res >= 0 && dump_enabled_p ())
3538 dump_printf_loc (MSG_NOTE, vect_location,
3539 "can tell at compile time that ");
3540 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3541 dump_printf (MSG_NOTE, " and ");
3542 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3543 if (res == 0)
3544 dump_printf (MSG_NOTE, " do not alias\n");
3545 else
3546 dump_printf (MSG_NOTE, " alias\n");
3549 if (res == 0)
3550 continue;
3552 if (res == 1)
3554 if (dump_enabled_p ())
3555 dump_printf_loc (MSG_NOTE, vect_location,
3556 "not vectorized: compilation time alias.\n");
3557 return false;
3561 dr_with_seg_len_pair_t dr_with_seg_len_pair
3562 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3563 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3565 /* Canonicalize pairs by sorting the two DR members. */
3566 if (comp_res > 0)
3567 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3569 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3572 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3574 unsigned int count = (comp_alias_ddrs.length ()
3575 + check_unequal_addrs.length ());
3577 dump_printf_loc (MSG_NOTE, vect_location,
3578 "improved number of alias checks from %d to %d\n",
3579 may_alias_ddrs.length (), count);
3580 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3582 if (dump_enabled_p ())
3583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3584 "number of versioning for alias "
3585 "run-time tests exceeds %d "
3586 "(--param vect-max-version-for-alias-checks)\n",
3587 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3588 return false;
3591 return true;
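/* For illustration: given a loop such as

     for (i = 0; i < n; i++)
       a[i] = b[i] + 1;

   where A and B might alias, each pair that survives the pruning above is
   later expanded into a runtime test of roughly the form

     a + segment_length_a <= b || b + segment_length_b <= a

   i.e. the two accessed segments must not overlap, and the vectorized
   loop is entered only if every such test succeeds.  */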
3594 /* Check whether we can use an internal function for a gather load
3595 or scatter store. READ_P is true for loads and false for stores.
3596 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3597 the type of the memory elements being loaded or stored. OFFSET_BITS
3598 is the number of bits in each scalar offset and OFFSET_SIGN is the
3599 sign of the offset. SCALE is the amount by which the offset should
3600 be multiplied *after* it has been converted to address width.
3602 Return true if the function is supported, storing the function
3603 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
3605 bool
3606 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3607 tree memory_type, unsigned int offset_bits,
3608 signop offset_sign, int scale,
3609 internal_fn *ifn_out, tree *element_type_out)
3611 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3612 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3613 if (offset_bits > element_bits)
3614 /* Internal functions require the offset to be the same width as
3615 the vector elements. We can extend narrower offsets, but it isn't
3616 safe to truncate wider offsets. */
3617 return false;
3619 if (element_bits != memory_bits)
3620 /* For now the vector elements must be the same width as the
3621 memory elements. */
3622 return false;
3624 /* Work out which function we need. */
3625 internal_fn ifn;
3626 if (read_p)
3627 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3628 else
3629 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3631 /* Test whether the target supports this combination. */
3632 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3633 offset_sign, scale))
3634 return false;
3636 *ifn_out = ifn;
3637 *element_type_out = TREE_TYPE (vectype);
3638 return true;
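/* For illustration: a typical gather-load candidate is

     for (i = 0; i < n; i++)
       sum += data[idx[i]];

   where the vector of offsets comes from IDX[i].  The checks above require
   the offsets not to be wider than the loaded elements (narrower offsets
   can be extended) and the memory elements to match the vector element
   width; SCALE would usually end up as sizeof (data[0]) once the caller
   has folded the element-size multiplication into it.  */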
3641 /* CALL is a call to an internal gather load or scatter store function.
3642 Describe the operation in INFO. */
3644 static void
3645 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3647 stmt_vec_info stmt_info = vinfo_for_stmt (call);
3648 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3649 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3651 info->ifn = gimple_call_internal_fn (call);
3652 info->decl = NULL_TREE;
3653 info->base = gimple_call_arg (call, 0);
3654 info->offset = gimple_call_arg (call, 1);
3655 info->offset_dt = vect_unknown_def_type;
3656 info->offset_vectype = NULL_TREE;
3657 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3658 info->element_type = TREE_TYPE (vectype);
3659 info->memory_type = TREE_TYPE (DR_REF (dr));
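/* Note that the description above is derived purely from the call's three
   leading arguments (base address, offset and scale, in that order) plus
   the statement's vectype and data reference; the offset vectype and
   definition kind are left for later analysis.  */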
3662 /* Return true if a non-affine read or write in STMT is suitable for a
3663 gather load or scatter store. Describe the operation in *INFO if so. */
3665 bool
3666 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3667 gather_scatter_info *info)
3669 HOST_WIDE_INT scale = 1;
3670 poly_int64 pbitpos, pbitsize;
3671 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3672 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3673 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3674 tree offtype = NULL_TREE;
3675 tree decl = NULL_TREE, base, off;
3676 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3677 tree memory_type = TREE_TYPE (DR_REF (dr));
3678 machine_mode pmode;
3679 int punsignedp, reversep, pvolatilep = 0;
3680 internal_fn ifn;
3681 tree element_type;
3682 bool masked_p = false;
3684 /* See whether this is already a call to a gather/scatter internal function.
3685 If not, see whether it's a masked load or store. */
3686 gcall *call = dyn_cast <gcall *> (stmt);
3687 if (call && gimple_call_internal_p (call))
3689 ifn = gimple_call_internal_fn (stmt);
3690 if (internal_gather_scatter_fn_p (ifn))
3692 vect_describe_gather_scatter_call (call, info);
3693 return true;
3695 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3698 /* True if we should aim to use internal functions rather than
3699 built-in functions. */
3700 bool use_ifn_p = (DR_IS_READ (dr)
3701 ? supports_vec_gather_load_p ()
3702 : supports_vec_scatter_store_p ());
3704 base = DR_REF (dr);
3705 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3706 see if we can use the def stmt of the address. */
3707 if (masked_p
3708 && TREE_CODE (base) == MEM_REF
3709 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3710 && integer_zerop (TREE_OPERAND (base, 1))
3711 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3713 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3714 if (is_gimple_assign (def_stmt)
3715 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3716 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3719 /* The gather and scatter builtins need address of the form
3720 loop_invariant + vector * {1, 2, 4, 8} or
3722 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3723 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3724 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3725 multiplications and additions in it. To get a vector, we need
3726 a single SSA_NAME that will be defined in the loop and will
3727 contain everything that is not loop invariant and that can be
3728 vectorized. The following code attempts to find such a preexisting
3729 SSA_NAME OFF and put the loop invariants into a tree BASE
3730 that can be gimplified before the loop. */
3731 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3732 &punsignedp, &reversep, &pvolatilep);
3733 if (reversep)
3734 return false;
3736 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3738 if (TREE_CODE (base) == MEM_REF)
3740 if (!integer_zerop (TREE_OPERAND (base, 1)))
3742 if (off == NULL_TREE)
3743 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3744 else
3745 off = size_binop (PLUS_EXPR, off,
3746 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3748 base = TREE_OPERAND (base, 0);
3750 else
3751 base = build_fold_addr_expr (base);
3753 if (off == NULL_TREE)
3754 off = size_zero_node;
3756 /* If base is not loop invariant, then either off is 0, in which case we
3757 start with just the constant offset in the loop invariant BASE and
3758 continue with base as OFF, or we give up.
3759 We could handle that case by gimplifying the addition of base + off
3760 into some SSA_NAME and using that as off, but for now punt. */
3761 if (!expr_invariant_in_loop_p (loop, base))
3763 if (!integer_zerop (off))
3764 return false;
3765 off = base;
3766 base = size_int (pbytepos);
3768 /* Otherwise put base + constant offset into the loop invariant BASE
3769 and continue with OFF. */
3770 else
3772 base = fold_convert (sizetype, base);
3773 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3776 /* OFF at this point may be either a SSA_NAME or some tree expression
3777 from get_inner_reference. Try to peel off loop invariants from it
3778 into BASE as long as possible. */
3779 STRIP_NOPS (off);
3780 while (offtype == NULL_TREE)
3782 enum tree_code code;
3783 tree op0, op1, add = NULL_TREE;
3785 if (TREE_CODE (off) == SSA_NAME)
3787 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3789 if (expr_invariant_in_loop_p (loop, off))
3790 return false;
3792 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3793 break;
3795 op0 = gimple_assign_rhs1 (def_stmt);
3796 code = gimple_assign_rhs_code (def_stmt);
3797 op1 = gimple_assign_rhs2 (def_stmt);
3799 else
3801 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3802 return false;
3803 code = TREE_CODE (off);
3804 extract_ops_from_tree (off, &code, &op0, &op1);
3806 switch (code)
3808 case POINTER_PLUS_EXPR:
3809 case PLUS_EXPR:
3810 if (expr_invariant_in_loop_p (loop, op0))
3812 add = op0;
3813 off = op1;
3814 do_add:
3815 add = fold_convert (sizetype, add);
3816 if (scale != 1)
3817 add = size_binop (MULT_EXPR, add, size_int (scale));
3818 base = size_binop (PLUS_EXPR, base, add);
3819 continue;
3821 if (expr_invariant_in_loop_p (loop, op1))
3823 add = op1;
3824 off = op0;
3825 goto do_add;
3827 break;
3828 case MINUS_EXPR:
3829 if (expr_invariant_in_loop_p (loop, op1))
3831 add = fold_convert (sizetype, op1);
3832 add = size_binop (MINUS_EXPR, size_zero_node, add);
3833 off = op0;
3834 goto do_add;
3836 break;
3837 case MULT_EXPR:
3838 if (scale == 1 && tree_fits_shwi_p (op1))
3840 int new_scale = tree_to_shwi (op1);
3841 /* Only treat this as a scaling operation if the target
3842 supports it. */
3843 if (use_ifn_p
3844 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3845 vectype, memory_type, 1,
3846 TYPE_SIGN (TREE_TYPE (op0)),
3847 new_scale, &ifn,
3848 &element_type))
3849 break;
3850 scale = new_scale;
3851 off = op0;
3852 continue;
3854 break;
3855 case SSA_NAME:
3856 off = op0;
3857 continue;
3858 CASE_CONVERT:
3859 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3860 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3861 break;
3862 if (TYPE_PRECISION (TREE_TYPE (op0))
3863 == TYPE_PRECISION (TREE_TYPE (off)))
3865 off = op0;
3866 continue;
3869 /* The internal functions need the offset to be the same width
3870 as the elements of VECTYPE. Don't include operations that
3871 cast the offset from that width to a different width. */
3872 if (use_ifn_p
3873 && (int_size_in_bytes (TREE_TYPE (vectype))
3874 == int_size_in_bytes (TREE_TYPE (off))))
3875 break;
3877 if (TYPE_PRECISION (TREE_TYPE (op0))
3878 < TYPE_PRECISION (TREE_TYPE (off)))
3880 off = op0;
3881 offtype = TREE_TYPE (off);
3882 STRIP_NOPS (off);
3883 continue;
3885 break;
3886 default:
3887 break;
3889 break;
3892 /* If at the end OFF still isn't a SSA_NAME or isn't
3893 defined in the loop, punt. */
3894 if (TREE_CODE (off) != SSA_NAME
3895 || expr_invariant_in_loop_p (loop, off))
3896 return false;
3898 if (offtype == NULL_TREE)
3899 offtype = TREE_TYPE (off);
3901 if (use_ifn_p)
3903 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3904 memory_type, TYPE_PRECISION (offtype),
3905 TYPE_SIGN (offtype), scale, &ifn,
3906 &element_type))
3907 return false;
3909 else
3911 if (DR_IS_READ (dr))
3913 if (targetm.vectorize.builtin_gather)
3914 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3916 else
3918 if (targetm.vectorize.builtin_scatter)
3919 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3922 if (!decl)
3923 return false;
3925 ifn = IFN_LAST;
3926 element_type = TREE_TYPE (vectype);
3929 info->ifn = ifn;
3930 info->decl = decl;
3931 info->base = base;
3932 info->offset = off;
3933 info->offset_dt = vect_unknown_def_type;
3934 info->offset_vectype = NULL_TREE;
3935 info->scale = scale;
3936 info->element_type = element_type;
3937 info->memory_type = memory_type;
3938 return true;
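/* For illustration: for an access like p->a[k[i]] inside the loop, the
   analysis above typically ends with BASE holding the loop-invariant part
   (the address of p->a plus any constant byte offset), OFF holding the
   SSA_NAME computed from k[i], and SCALE holding the element size peeled
   off a MULT_EXPR, so that the accessed address is BASE + OFF * SCALE.  */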
3941 /* Find the data references in STMT, analyze them with respect to LOOP and
3942 append them to DATAREFS. Return false if datarefs in this stmt cannot
3943 be handled. */
3945 bool
3946 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3947 vec<data_reference_p> *datarefs)
3949 /* We can ignore clobbers for dataref analysis - they are removed during
3950 loop vectorization and BB vectorization checks dependences with a
3951 stmt walk. */
3952 if (gimple_clobber_p (stmt))
3953 return true;
3955 if (gimple_has_volatile_ops (stmt))
3957 if (dump_enabled_p ())
3959 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3960 "not vectorized: volatile type ");
3961 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3963 return false;
3966 if (stmt_can_throw_internal (stmt))
3968 if (dump_enabled_p ())
3970 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3971 "not vectorized: statement can throw an "
3972 "exception ");
3973 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3975 return false;
3978 auto_vec<data_reference_p, 2> refs;
3979 if (!find_data_references_in_stmt (loop, stmt, &refs))
3980 return false;
3982 if (refs.is_empty ())
3983 return true;
3985 if (refs.length () > 1)
3987 if (dump_enabled_p ())
3989 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3990 "not vectorized: more than one data ref "
3991 "in stmt: ");
3992 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3994 return false;
3997 if (gcall *call = dyn_cast <gcall *> (stmt))
3998 if (!gimple_call_internal_p (call)
3999 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4000 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4002 if (dump_enabled_p ())
4004 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4005 "not vectorized: dr in a call ");
4006 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4008 return false;
4011 data_reference_p dr = refs.pop ();
4012 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4013 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4015 if (dump_enabled_p ())
4017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4018 "not vectorized: statement is bitfield "
4019 "access ");
4020 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4022 return false;
4025 if (DR_BASE_ADDRESS (dr)
4026 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4028 if (dump_enabled_p ())
4029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4030 "not vectorized: base addr of dr is a "
4031 "constant\n");
4032 return false;
4035 datarefs->safe_push (dr);
4036 return true;
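/* For illustration: a simple load such as "x = a[i]" yields exactly one
   data reference and is pushed onto DATAREFS above, whereas an aggregate
   copy like "*p = *q" produces two references in one statement and is
   rejected, as are volatile accesses, statements that may throw
   internally, and bit-field accesses.  */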
4039 /* Function vect_analyze_data_refs.
4041 Find all the data references in the loop or basic block.
4043 The general structure of the analysis of data refs in the vectorizer is as
4044 follows:
4045 1- vect_analyze_data_refs(loop/bb): call
4046 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4047 in the loop/bb and their dependences.
4048 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4049 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4050 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4054 bool
4055 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4057 struct loop *loop = NULL;
4058 unsigned int i;
4059 struct data_reference *dr;
4060 tree scalar_type;
4062 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4064 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4065 loop = LOOP_VINFO_LOOP (loop_vinfo);
4067 /* Go through the data-refs, check that the analysis succeeded. Update
4068 pointer from stmt_vec_info struct to DR and vectype. */
4070 vec<data_reference_p> datarefs = vinfo->datarefs;
4071 FOR_EACH_VEC_ELT (datarefs, i, dr)
4073 gimple *stmt;
4074 stmt_vec_info stmt_info;
4075 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4076 bool simd_lane_access = false;
4077 poly_uint64 vf;
4079 gcc_assert (DR_REF (dr));
4080 stmt = vect_dr_stmt (dr);
4081 stmt_info = vinfo_for_stmt (stmt);
4083 /* Check that analysis of the data-ref succeeded. */
4084 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4085 || !DR_STEP (dr))
4087 bool maybe_gather
4088 = DR_IS_READ (dr)
4089 && !TREE_THIS_VOLATILE (DR_REF (dr))
4090 && (targetm.vectorize.builtin_gather != NULL
4091 || supports_vec_gather_load_p ());
4092 bool maybe_scatter
4093 = DR_IS_WRITE (dr)
4094 && !TREE_THIS_VOLATILE (DR_REF (dr))
4095 && (targetm.vectorize.builtin_scatter != NULL
4096 || supports_vec_scatter_store_p ());
4097 bool maybe_simd_lane_access
4098 = is_a <loop_vec_info> (vinfo) && loop->simduid;
4100 /* If target supports vector gather loads or scatter stores, or if
4101 this might be a SIMD lane access, see whether they can be used.
4102 if (is_a <loop_vec_info> (vinfo)
4103 && !nested_in_vect_loop_p (loop, stmt))
4105 if (maybe_simd_lane_access)
4107 struct data_reference *newdr
4108 = create_data_ref (NULL, loop_containing_stmt (stmt),
4109 DR_REF (dr), stmt, !maybe_scatter,
4110 DR_IS_CONDITIONAL_IN_STMT (dr));
4111 gcc_assert (newdr != NULL && DR_REF (newdr));
4112 if (DR_BASE_ADDRESS (newdr)
4113 && DR_OFFSET (newdr)
4114 && DR_INIT (newdr)
4115 && DR_STEP (newdr)
4116 && integer_zerop (DR_STEP (newdr)))
4118 tree off = DR_OFFSET (newdr);
4119 STRIP_NOPS (off);
4120 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4121 && TREE_CODE (off) == MULT_EXPR
4122 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4124 tree step = TREE_OPERAND (off, 1);
4125 off = TREE_OPERAND (off, 0);
4126 STRIP_NOPS (off);
4127 if (CONVERT_EXPR_P (off)
4128 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4129 0)))
4130 < TYPE_PRECISION (TREE_TYPE (off)))
4131 off = TREE_OPERAND (off, 0);
4132 if (TREE_CODE (off) == SSA_NAME)
4134 gimple *def = SSA_NAME_DEF_STMT (off);
4135 tree reft = TREE_TYPE (DR_REF (newdr));
4136 if (is_gimple_call (def)
4137 && gimple_call_internal_p (def)
4138 && (gimple_call_internal_fn (def)
4139 == IFN_GOMP_SIMD_LANE))
4141 tree arg = gimple_call_arg (def, 0);
4142 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4143 arg = SSA_NAME_VAR (arg);
4144 if (arg == loop->simduid
4145 /* For now. */
4146 && tree_int_cst_equal
4147 (TYPE_SIZE_UNIT (reft),
4148 step))
4150 DR_OFFSET (newdr) = ssize_int (0);
4151 DR_STEP (newdr) = step;
4152 DR_OFFSET_ALIGNMENT (newdr)
4153 = BIGGEST_ALIGNMENT;
4154 DR_STEP_ALIGNMENT (newdr)
4155 = highest_pow2_factor (step);
4156 dr = newdr;
4157 simd_lane_access = true;
4163 if (!simd_lane_access)
4164 free_data_ref (newdr);
4166 if (!simd_lane_access && (maybe_gather || maybe_scatter))
4168 if (maybe_gather)
4169 gatherscatter = GATHER;
4170 else
4171 gatherscatter = SCATTER;
4175 if (gatherscatter == SG_NONE && !simd_lane_access)
4177 if (dump_enabled_p ())
4179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4180 "not vectorized: data ref analysis "
4181 "failed ");
4182 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4184 if (is_a <bb_vec_info> (vinfo))
4186 /* In BB vectorization the ref can still participate
4187 in dependence analysis, we just can't vectorize it. */
4188 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4189 continue;
4191 return false;
4195 tree base = get_base_address (DR_REF (dr));
4196 if (base && VAR_P (base) && DECL_NONALIASED (base))
4198 if (dump_enabled_p ())
4200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4201 "not vectorized: base object not addressable "
4202 "for stmt: ");
4203 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4205 if (is_a <bb_vec_info> (vinfo))
4207 /* In BB vectorization the ref can still participate
4208 in dependence analysis, we just can't vectorize it. */
4209 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4210 continue;
4212 return false;
4215 if (is_a <loop_vec_info> (vinfo)
4216 && DR_STEP (dr)
4217 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4219 if (nested_in_vect_loop_p (loop, stmt))
4221 if (dump_enabled_p ())
4223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4224 "not vectorized: not suitable for strided "
4225 "load ");
4226 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4228 return false;
4230 STMT_VINFO_STRIDED_P (stmt_info) = true;
4233 /* Update DR field in stmt_vec_info struct. */
4235 /* If the dataref is in an inner-loop of the loop that is considered
4236 for vectorization, we also want to analyze the access relative to
4237 the outer-loop (DR contains information only relative to the
4238 inner-most enclosing loop). We do that by building a reference to the
4239 first location accessed by the inner-loop, and analyze it relative to
4240 the outer-loop. */
4241 if (loop && nested_in_vect_loop_p (loop, stmt))
4243 /* Build a reference to the first location accessed by the
4244 inner loop: *(BASE + INIT + OFFSET). By construction,
4245 this address must be invariant in the inner loop, so we
4246 can consider it as being used in the outer loop. */
4247 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4248 tree offset = unshare_expr (DR_OFFSET (dr));
4249 tree init = unshare_expr (DR_INIT (dr));
4250 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4251 init, offset);
4252 tree init_addr = fold_build_pointer_plus (base, init_offset);
4253 tree init_ref = build_fold_indirect_ref (init_addr);
4255 if (dump_enabled_p ())
4257 dump_printf_loc (MSG_NOTE, vect_location,
4258 "analyze in outer loop: ");
4259 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4260 dump_printf (MSG_NOTE, "\n");
4263 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4264 init_ref, loop))
4265 /* dr_analyze_innermost already explained the failure. */
4266 return false;
4268 if (dump_enabled_p ())
4270 dump_printf_loc (MSG_NOTE, vect_location,
4271 "\touter base_address: ");
4272 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4273 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4274 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4275 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4276 STMT_VINFO_DR_OFFSET (stmt_info));
4277 dump_printf (MSG_NOTE,
4278 "\n\touter constant offset from base address: ");
4279 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4280 STMT_VINFO_DR_INIT (stmt_info));
4281 dump_printf (MSG_NOTE, "\n\touter step: ");
4282 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4283 STMT_VINFO_DR_STEP (stmt_info));
4284 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4285 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4286 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4287 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4288 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4289 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4290 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4291 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4295 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
4296 STMT_VINFO_DATA_REF (stmt_info) = dr;
4297 if (simd_lane_access)
4299 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4300 free_data_ref (datarefs[i]);
4301 datarefs[i] = dr;
4304 /* Set vectype for STMT. */
4305 scalar_type = TREE_TYPE (DR_REF (dr));
4306 STMT_VINFO_VECTYPE (stmt_info)
4307 = get_vectype_for_scalar_type (scalar_type);
4308 if (!STMT_VINFO_VECTYPE (stmt_info))
4310 if (dump_enabled_p ())
4312 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4313 "not vectorized: no vectype for stmt: ");
4314 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4315 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4316 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4317 scalar_type);
4318 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4321 if (is_a <bb_vec_info> (vinfo))
4323 /* No vector type is fine, the ref can still participate
4324 in dependence analysis, we just can't vectorize it. */
4325 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4326 continue;
4329 if (simd_lane_access)
4331 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4332 free_data_ref (dr);
4334 return false;
4336 else
4338 if (dump_enabled_p ())
4340 dump_printf_loc (MSG_NOTE, vect_location,
4341 "got vectype for stmt: ");
4342 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4343 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4344 STMT_VINFO_VECTYPE (stmt_info));
4345 dump_printf (MSG_NOTE, "\n");
4349 /* Adjust the minimal vectorization factor according to the
4350 vector type. */
4351 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4352 *min_vf = upper_bound (*min_vf, vf);
4354 if (gatherscatter != SG_NONE)
4356 gather_scatter_info gs_info;
4357 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4358 &gs_info)
4359 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4361 if (dump_enabled_p ())
4363 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4364 (gatherscatter == GATHER) ?
4365 "not vectorized: not suitable for gather "
4366 "load " :
4367 "not vectorized: not suitable for scatter "
4368 "store ");
4369 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4371 return false;
4373 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4377 /* We used to stop processing and prune the list here. Verify we no
4378 longer need to. */
4379 gcc_assert (i == datarefs.length ());
4381 return true;
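/* For illustration: the SIMD lane access case above matches the arrays GCC
   uses for privatized scalars in an OpenMP simd loop, e.g.

     #pragma omp simd private (t)
     for (i = 0; i < n; i++)
       { t = a[i] + 1; b[i] = t; }

   where T is backed by an array indexed by IFN_GOMP_SIMD_LANE and the
   original data-ref has a zero step with the lane times the element size
   as its offset; the code rewrites it to a zero offset and an element-size
   step so that each vector lane accesses its own slot.  */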
4385 /* Function vect_get_new_vect_var.
4387 Returns a name for a new variable. The current naming scheme prepends a
4388 prefix ("vect", "stmp", "mask" or "vectp", depending on the value of
4389 VAR_KIND) to the name of the vectorizer-generated variable, and appends
4390 NAME to it if provided. */
4392 tree
4393 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4395 const char *prefix;
4396 tree new_vect_var;
4398 switch (var_kind)
4400 case vect_simple_var:
4401 prefix = "vect";
4402 break;
4403 case vect_scalar_var:
4404 prefix = "stmp";
4405 break;
4406 case vect_mask_var:
4407 prefix = "mask";
4408 break;
4409 case vect_pointer_var:
4410 prefix = "vectp";
4411 break;
4412 default:
4413 gcc_unreachable ();
4416 if (name)
4418 char* tmp = concat (prefix, "_", name, NULL);
4419 new_vect_var = create_tmp_reg (type, tmp);
4420 free (tmp);
4422 else
4423 new_vect_var = create_tmp_reg (type, prefix);
4425 return new_vect_var;
4428 /* Like vect_get_new_vect_var but return an SSA name. */
4430 tree
4431 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4433 const char *prefix;
4434 tree new_vect_var;
4436 switch (var_kind)
4438 case vect_simple_var:
4439 prefix = "vect";
4440 break;
4441 case vect_scalar_var:
4442 prefix = "stmp";
4443 break;
4444 case vect_pointer_var:
4445 prefix = "vectp";
4446 break;
4447 default:
4448 gcc_unreachable ();
4451 if (name)
4453 char* tmp = concat (prefix, "_", name, NULL);
4454 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4455 free (tmp);
4457 else
4458 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4460 return new_vect_var;
4463 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4465 static void
4466 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4468 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4469 int misalign = DR_MISALIGNMENT (dr);
4470 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4471 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4472 else
4473 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4474 DR_TARGET_ALIGNMENT (dr), misalign);
4477 /* Function vect_create_addr_base_for_vector_ref.
4479 Create an expression that computes the address of the first memory location
4480 that will be accessed for a data reference.
4482 Input:
4483 STMT: The statement containing the data reference.
4484 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4485 OFFSET: Optional. If supplied, it is added to the initial address.
4486 LOOP: Specify relative to which loop-nest the address should be computed.
4487 For example, when the dataref is in an inner-loop nested in an
4488 outer-loop that is now being vectorized, LOOP can be either the
4489 outer-loop, or the inner-loop. The first memory location accessed
4490 by the following dataref ('in' points to short):
4492 for (i=0; i<N; i++)
4493 for (j=0; j<M; j++)
4494 s += in[i+j]
4496 is as follows:
4497 if LOOP=i_loop: &in (relative to i_loop)
4498 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4499 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4500 initial address. Unlike OFFSET, which is number of elements to
4501 be added, BYTE_OFFSET is measured in bytes.
4503 Output:
4504 1. Return an SSA_NAME whose value is the address of the memory location of
4505 the first vector of the data reference.
4506 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4507 these statement(s) which define the returned SSA_NAME.
4509 FORNOW: We are only handling array accesses with step 1. */
4511 tree
4512 vect_create_addr_base_for_vector_ref (gimple *stmt,
4513 gimple_seq *new_stmt_list,
4514 tree offset,
4515 tree byte_offset)
4517 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4518 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4519 const char *base_name;
4520 tree addr_base;
4521 tree dest;
4522 gimple_seq seq = NULL;
4523 tree vect_ptr_type;
4524 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4525 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4526 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4528 tree data_ref_base = unshare_expr (drb->base_address);
4529 tree base_offset = unshare_expr (drb->offset);
4530 tree init = unshare_expr (drb->init);
4532 if (loop_vinfo)
4533 base_name = get_name (data_ref_base);
4534 else
4536 base_offset = ssize_int (0);
4537 init = ssize_int (0);
4538 base_name = get_name (DR_REF (dr));
4541 /* Create base_offset */
4542 base_offset = size_binop (PLUS_EXPR,
4543 fold_convert (sizetype, base_offset),
4544 fold_convert (sizetype, init));
4546 if (offset)
4548 offset = fold_build2 (MULT_EXPR, sizetype,
4549 fold_convert (sizetype, offset), step);
4550 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4551 base_offset, offset);
4553 if (byte_offset)
4555 byte_offset = fold_convert (sizetype, byte_offset);
4556 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4557 base_offset, byte_offset);
4560 /* base + base_offset */
4561 if (loop_vinfo)
4562 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4563 else
4565 addr_base = build1 (ADDR_EXPR,
4566 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4567 unshare_expr (DR_REF (dr)));
4570 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4571 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4572 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4573 gimple_seq_add_seq (new_stmt_list, seq);
4575 if (DR_PTR_INFO (dr)
4576 && TREE_CODE (addr_base) == SSA_NAME
4577 && !SSA_NAME_PTR_INFO (addr_base))
4579 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4580 if (offset || byte_offset)
4581 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4584 if (dump_enabled_p ())
4586 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4587 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4588 dump_printf (MSG_NOTE, "\n");
4591 return addr_base;
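/* For illustration of the OFFSET vs. BYTE_OFFSET distinction above: for a
   data reference to an array of shorts, OFFSET == 4 adds
   4 * sizeof (short) bytes to the computed address, whereas
   BYTE_OFFSET == 4 adds exactly 4 bytes.  */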
4595 /* Function vect_create_data_ref_ptr.
4597 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4598 location accessed in the loop by STMT, along with the def-use update
4599 chain to appropriately advance the pointer through the loop iterations.
4600 Also set aliasing information for the pointer. This pointer is used by
4601 the callers to this function to create a memory reference expression for
4602 vector load/store access.
4604 Input:
4605 1. STMT: a stmt that references memory. Expected to be of the form
4606 GIMPLE_ASSIGN <name, data-ref> or
4607 GIMPLE_ASSIGN <data-ref, name>.
4608 2. AGGR_TYPE: the type of the reference, which should be either a vector
4609 or an array.
4610 3. AT_LOOP: the loop where the vector memref is to be created.
4611 4. OFFSET (optional): an offset to be added to the initial address accessed
4612 by the data-ref in STMT.
4613 5. BSI: location where the new stmts are to be placed if there is no loop
4614 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4615 pointing to the initial address.
4616 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4617 to the initial address accessed by the data-ref in STMT. This is
4618 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4619 in bytes.
4620 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4621 to the IV during each iteration of the loop. NULL says to move
4622 by one copy of AGGR_TYPE up or down, depending on the step of the
4623 data reference.
4625 Output:
4626 1. Declare a new ptr to vector_type, and have it point to the base of the
4627 data reference (initial address accessed by the data reference).
4628 For example, for vector of type V8HI, the following code is generated:
4630 v8hi *ap;
4631 ap = (v8hi *)initial_address;
4633 if OFFSET is not supplied:
4634 initial_address = &a[init];
4635 if OFFSET is supplied:
4636 initial_address = &a[init + OFFSET];
4637 if BYTE_OFFSET is supplied:
4638 initial_address = &a[init] + BYTE_OFFSET;
4640 Return the initial_address in INITIAL_ADDRESS.
4642 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4643 update the pointer in each iteration of the loop.
4645 Return the increment stmt that updates the pointer in PTR_INCR.
4647 3. Set INV_P to true if the access pattern of the data reference in the
4648 vectorized loop is invariant. Set it to false otherwise.
4650 4. Return the pointer. */
4652 tree
4653 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4654 tree offset, tree *initial_address,
4655 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4656 bool only_init, bool *inv_p, tree byte_offset,
4657 tree iv_step)
4659 const char *base_name;
4660 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4661 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4662 struct loop *loop = NULL;
4663 bool nested_in_vect_loop = false;
4664 struct loop *containing_loop = NULL;
4665 tree aggr_ptr_type;
4666 tree aggr_ptr;
4667 tree new_temp;
4668 gimple_seq new_stmt_list = NULL;
4669 edge pe = NULL;
4670 basic_block new_bb;
4671 tree aggr_ptr_init;
4672 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4673 tree aptr;
4674 gimple_stmt_iterator incr_gsi;
4675 bool insert_after;
4676 tree indx_before_incr, indx_after_incr;
4677 gimple *incr;
4678 tree step;
4679 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4681 gcc_assert (iv_step != NULL_TREE
4682 || TREE_CODE (aggr_type) == ARRAY_TYPE
4683 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4685 if (loop_vinfo)
4687 loop = LOOP_VINFO_LOOP (loop_vinfo);
4688 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4689 containing_loop = (gimple_bb (stmt))->loop_father;
4690 pe = loop_preheader_edge (loop);
4692 else
4694 gcc_assert (bb_vinfo);
4695 only_init = true;
4696 *ptr_incr = NULL;
4699 /* Check the step (evolution) of the load in LOOP, and record
4700 whether it's invariant. */
4701 step = vect_dr_behavior (dr)->step;
4702 if (integer_zerop (step))
4703 *inv_p = true;
4704 else
4705 *inv_p = false;
4707 /* Create an expression for the first address accessed by this load
4708 in LOOP. */
4709 base_name = get_name (DR_BASE_ADDRESS (dr));
4711 if (dump_enabled_p ())
4713 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4714 dump_printf_loc (MSG_NOTE, vect_location,
4715 "create %s-pointer variable to type: ",
4716 get_tree_code_name (TREE_CODE (aggr_type)));
4717 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4718 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4719 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4720 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4721 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4722 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4723 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4724 else
4725 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4726 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4727 dump_printf (MSG_NOTE, "\n");
4730 /* (1) Create the new aggregate-pointer variable.
4731 Vector and array types inherit the alias set of their component
4732 type by default so we need to use a ref-all pointer if the data
4733 reference does not conflict with the created aggregated data
4734 reference because it is not addressable. */
4735 bool need_ref_all = false;
4736 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4737 get_alias_set (DR_REF (dr))))
4738 need_ref_all = true;
4739 /* Likewise for any of the data references in the stmt group. */
4740 else if (DR_GROUP_SIZE (stmt_info) > 1)
4742 gimple *orig_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info);
4745 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4746 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4747 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4748 get_alias_set (DR_REF (sdr))))
4750 need_ref_all = true;
4751 break;
4753 orig_stmt = DR_GROUP_NEXT_ELEMENT (sinfo);
4755 while (orig_stmt);
4757 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4758 need_ref_all);
4759 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4762 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4763 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4764 def-use update cycles for the pointer: one relative to the outer-loop
4765 (LOOP), which is what steps (3) and (4) below do. The other is relative
4766 to the inner-loop (which is the inner-most loop containing the dataref),
4767 and this is done by step (5) below.
4769 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4770 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4771 redundant. Steps (3),(4) create the following:
4773 vp0 = &base_addr;
4774 LOOP: vp1 = phi(vp0,vp2)
4777 vp2 = vp1 + step
4778 goto LOOP
4780 If there is an inner-loop nested in loop, then step (5) will also be
4781 applied, and an additional update in the inner-loop will be created:
4783 vp0 = &base_addr;
4784 LOOP: vp1 = phi(vp0,vp2)
4786 inner: vp3 = phi(vp1,vp4)
4787 vp4 = vp3 + inner_step
4788 if () goto inner
4790 vp2 = vp1 + step
4791 if () goto LOOP */
4793 /* (2) Calculate the initial address of the aggregate-pointer, and set
4794 the aggregate-pointer to point to it before the loop. */
4796 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4798 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4799 offset, byte_offset);
4800 if (new_stmt_list)
4802 if (pe)
4804 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4805 gcc_assert (!new_bb);
4807 else
4808 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4811 *initial_address = new_temp;
4812 aggr_ptr_init = new_temp;
4814 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4815 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4816 inner-loop nested in LOOP (during outer-loop vectorization). */
4818 /* No update in loop is required. */
4819 if (only_init && (!loop_vinfo || at_loop == loop))
4820 aptr = aggr_ptr_init;
4821 else
4823 if (iv_step == NULL_TREE)
4825 /* The step of the aggregate pointer is the type size. */
4826 iv_step = TYPE_SIZE_UNIT (aggr_type);
4827 /* One exception to the above is when the scalar step of the load in
4828 LOOP is zero. In this case the step here is also zero. */
4829 if (*inv_p)
4830 iv_step = size_zero_node;
4831 else if (tree_int_cst_sgn (step) == -1)
4832 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4835 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4837 create_iv (aggr_ptr_init,
4838 fold_convert (aggr_ptr_type, iv_step),
4839 aggr_ptr, loop, &incr_gsi, insert_after,
4840 &indx_before_incr, &indx_after_incr);
4841 incr = gsi_stmt (incr_gsi);
4842 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4844 /* Copy the points-to information if it exists. */
4845 if (DR_PTR_INFO (dr))
4847 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4848 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4850 if (ptr_incr)
4851 *ptr_incr = incr;
4853 aptr = indx_before_incr;
4856 if (!nested_in_vect_loop || only_init)
4857 return aptr;
4860 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4861 nested in LOOP, if it exists. */
4863 gcc_assert (nested_in_vect_loop);
4864 if (!only_init)
4866 standard_iv_increment_position (containing_loop, &incr_gsi,
4867 &insert_after);
4868 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4869 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4870 &indx_after_incr);
4871 incr = gsi_stmt (incr_gsi);
4872 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4874 /* Copy the points-to information if it exists. */
4875 if (DR_PTR_INFO (dr))
4877 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4878 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4880 if (ptr_incr)
4881 *ptr_incr = incr;
4883 return indx_before_incr;
4885 else
4886 gcc_unreachable ();
4890 /* Function bump_vector_ptr
4892 Increment a pointer (to a vector type) by vector-size. If requested,
4893 i.e. if PTR-INCR is given, then also connect the new increment stmt
4894 to the existing def-use update-chain of the pointer, by modifying
4895 the PTR_INCR as illustrated below:
4897 The pointer def-use update-chain before this function:
4898 DATAREF_PTR = phi (p_0, p_2)
4899 ....
4900 PTR_INCR: p_2 = DATAREF_PTR + step
4902 The pointer def-use update-chain after this function:
4903 DATAREF_PTR = phi (p_0, p_2)
4904 ....
4905 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4906 ....
4907 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4909 Input:
4910 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4911 in the loop.
4912 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4913 the loop. The increment amount across iterations is expected
4914 to be vector_size.
4915 BSI - location where the new update stmt is to be placed.
4916 STMT - the original scalar memory-access stmt that is being vectorized.
4917 BUMP - optional. The offset by which to bump the pointer. If not given,
4918 the offset is assumed to be vector_size.
4920 Output: Return NEW_DATAREF_PTR as illustrated above.
4924 tree
4925 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4926 gimple *stmt, tree bump)
4928 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4929 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4930 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4931 tree update = TYPE_SIZE_UNIT (vectype);
4932 gassign *incr_stmt;
4933 ssa_op_iter iter;
4934 use_operand_p use_p;
4935 tree new_dataref_ptr;
4937 if (bump)
4938 update = bump;
4940 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4941 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4942 else
4943 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4944 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4945 dataref_ptr, update);
4946 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4948 /* Copy the points-to information if it exists. */
4949 if (DR_PTR_INFO (dr))
4951 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4952 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4955 if (!ptr_incr)
4956 return new_dataref_ptr;
4958 /* Update the vector-pointer's cross-iteration increment. */
4959 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4961 tree use = USE_FROM_PTR (use_p);
4963 if (use == dataref_ptr)
4964 SET_USE (use_p, new_dataref_ptr);
4965 else
4966 gcc_assert (operand_equal_p (use, update, 0));
4969 return new_dataref_ptr;
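/* For illustration: when the vector type is V8HI (16 bytes) and no BUMP is
   given, the statement generated above is simply

     NEW_DATAREF_PTR = DATAREF_PTR + 16

   and, if PTR_INCR was supplied, its use of DATAREF_PTR is redirected to
   NEW_DATAREF_PTR, keeping the cross-iteration update chain intact.  */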
4973 /* Copy memory reference info such as base/clique from the SRC reference
4974 to the DEST MEM_REF. */
4976 void
4977 vect_copy_ref_info (tree dest, tree src)
4979 if (TREE_CODE (dest) != MEM_REF)
4980 return;
4982 tree src_base = src;
4983 while (handled_component_p (src_base))
4984 src_base = TREE_OPERAND (src_base, 0);
4985 if (TREE_CODE (src_base) != MEM_REF
4986 && TREE_CODE (src_base) != TARGET_MEM_REF)
4987 return;
4989 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
4990 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
4994 /* Function vect_create_destination_var.
4996 Create a new temporary of type VECTYPE. */
4998 tree
4999 vect_create_destination_var (tree scalar_dest, tree vectype)
5001 tree vec_dest;
5002 const char *name;
5003 char *new_name;
5004 tree type;
5005 enum vect_var_kind kind;
5007 kind = vectype
5008 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5009 ? vect_mask_var
5010 : vect_simple_var
5011 : vect_scalar_var;
5012 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5014 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5016 name = get_name (scalar_dest);
5017 if (name)
5018 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5019 else
5020 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5021 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5022 free (new_name);
5024 return vec_dest;
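/* For illustration: for a scalar destination x_5 and a non-boolean vector
   type, the code above creates a temporary named "vect_x_5"; a boolean
   vector type yields the "mask" prefix and a NULL VECTYPE the "stmp"
   prefix.  */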
5027 /* Function vect_grouped_store_supported.
5029 Returns TRUE if interleave high and interleave low permutations
5030 are supported, and FALSE otherwise. */
5032 bool
5033 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5035 machine_mode mode = TYPE_MODE (vectype);
5037 /* vect_permute_store_chain requires the group size to be equal to 3 or
5038 be a power of two. */
5039 if (count != 3 && exact_log2 (count) == -1)
5041 if (dump_enabled_p ())
5042 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5043 "the size of the group of accesses"
5044 " is not a power of 2 or not eqaul to 3\n");
5045 return false;
5048 /* Check that the permutation is supported. */
5049 if (VECTOR_MODE_P (mode))
5051 unsigned int i;
5052 if (count == 3)
5054 unsigned int j0 = 0, j1 = 0, j2 = 0;
5055 unsigned int i, j;
5057 unsigned int nelt;
5058 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5060 if (dump_enabled_p ())
5061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5062 "cannot handle groups of 3 stores for"
5063 " variable-length vectors\n");
5064 return false;
5067 vec_perm_builder sel (nelt, nelt, 1);
5068 sel.quick_grow (nelt);
5069 vec_perm_indices indices;
5070 for (j = 0; j < 3; j++)
5072 int nelt0 = ((3 - j) * nelt) % 3;
5073 int nelt1 = ((3 - j) * nelt + 1) % 3;
5074 int nelt2 = ((3 - j) * nelt + 2) % 3;
5075 for (i = 0; i < nelt; i++)
5077 if (3 * i + nelt0 < nelt)
5078 sel[3 * i + nelt0] = j0++;
5079 if (3 * i + nelt1 < nelt)
5080 sel[3 * i + nelt1] = nelt + j1++;
5081 if (3 * i + nelt2 < nelt)
5082 sel[3 * i + nelt2] = 0;
5084 indices.new_vector (sel, 2, nelt);
5085 if (!can_vec_perm_const_p (mode, indices))
5087 if (dump_enabled_p ())
5088 dump_printf (MSG_MISSED_OPTIMIZATION,
5089 "permutation op not supported by target.\n");
5090 return false;
5093 for (i = 0; i < nelt; i++)
5095 if (3 * i + nelt0 < nelt)
5096 sel[3 * i + nelt0] = 3 * i + nelt0;
5097 if (3 * i + nelt1 < nelt)
5098 sel[3 * i + nelt1] = 3 * i + nelt1;
5099 if (3 * i + nelt2 < nelt)
5100 sel[3 * i + nelt2] = nelt + j2++;
5102 indices.new_vector (sel, 2, nelt);
5103 if (!can_vec_perm_const_p (mode, indices))
5105 if (dump_enabled_p ())
5106 dump_printf (MSG_MISSED_OPTIMIZATION,
5107 "permutation op not supported by target.\n");
5108 return false;
5111 return true;
5113 else
5115 /* If length is not equal to 3 then only power of 2 is supported. */
5116 gcc_assert (pow2p_hwi (count));
5117 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5119 /* The encoding has 2 interleaved stepped patterns. */
5120 vec_perm_builder sel (nelt, 2, 3);
5121 sel.quick_grow (6);
5122 for (i = 0; i < 3; i++)
5124 sel[i * 2] = i;
5125 sel[i * 2 + 1] = i + nelt;
5127 vec_perm_indices indices (sel, 2, nelt);
5128 if (can_vec_perm_const_p (mode, indices))
5130 for (i = 0; i < 6; i++)
5131 sel[i] += exact_div (nelt, 2);
5132 indices.new_vector (sel, 2, nelt);
5133 if (can_vec_perm_const_p (mode, indices))
5134 return true;
5139 if (dump_enabled_p ())
5140 dump_printf (MSG_MISSED_OPTIMIZATION,
5141 "permutaion op not supported by target.\n");
5142 return false;
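/* For illustration: for V4SI and a power-of-two group size, the two
   permutations whose support is tested above are the interleave-high
   mask { 0, 4, 1, 5 } and, after the exact_div adjustment, the
   interleave-low mask { 2, 6, 3, 7 }, the same masks that
   vect_permute_store_chain builds below.  */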
5146 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5147 type VECTYPE. MASKED_P says whether the masked form is needed. */
5149 bool
5150 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5151 bool masked_p)
5153 if (masked_p)
5154 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5155 vec_mask_store_lanes_optab,
5156 vectype, count);
5157 else
5158 return vect_lanes_optab_supported_p ("vec_store_lanes",
5159 vec_store_lanes_optab,
5160 vectype, count);
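/* For illustration: when this returns true, the vectorizer can emit a
   single store-lanes operation that interleaves COUNT vectors as it
   stores them (mapping to dedicated instructions on some targets, such
   as the AArch64 ST2/ST3/ST4 family), instead of the explicit
   permutations otherwise produced by vect_permute_store_chain.  */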
5164 /* Function vect_permute_store_chain.
5166 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5167 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5168 the data correctly for the stores. Return the final references for stores
5169 in RESULT_CHAIN.
5171 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5172 The input is 4 vectors each containing 8 elements. We assign a number to
5173 each element, the input sequence is:
5175 1st vec: 0 1 2 3 4 5 6 7
5176 2nd vec: 8 9 10 11 12 13 14 15
5177 3rd vec: 16 17 18 19 20 21 22 23
5178 4th vec: 24 25 26 27 28 29 30 31
5180 The output sequence should be:
5182 1st vec: 0 8 16 24 1 9 17 25
5183 2nd vec: 2 10 18 26 3 11 19 27
5184 3rd vec: 4 12 20 28 5 13 21 29
5185 4th vec: 6 14 22 30 7 15 23 31
5187 i.e., we interleave the contents of the four vectors in their order.
5189 We use interleave_high/low instructions to create such output. The input of
5190 each interleave_high/low operation is two vectors:
5191 1st vec 2nd vec
5192 0 1 2 3 4 5 6 7
5193 the even elements of the result vector are obtained left-to-right from the
5194 high/low elements of the first vector. The odd elements of the result are
5195 obtained left-to-right from the high/low elements of the second vector.
5196 The output of interleave_high will be: 0 4 1 5
5197 and of interleave_low: 2 6 3 7
5200 The permutation is done in log LENGTH stages. In each stage interleave_high
5201 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5202 where the first argument is taken from the first half of DR_CHAIN and the
5203 second argument from its second half.
5204 In our example,
5206 I1: interleave_high (1st vec, 3rd vec)
5207 I2: interleave_low (1st vec, 3rd vec)
5208 I3: interleave_high (2nd vec, 4th vec)
5209 I4: interleave_low (2nd vec, 4th vec)
5211 The output for the first stage is:
5213 I1: 0 16 1 17 2 18 3 19
5214 I2: 4 20 5 21 6 22 7 23
5215 I3: 8 24 9 25 10 26 11 27
5216 I4: 12 28 13 29 14 30 15 31
5218 The output of the second stage, i.e. the final result is:
5220 I1: 0 8 16 24 1 9 17 25
5221 I2: 2 10 18 26 3 11 19 27
5222 I3: 4 12 20 28 5 13 21 29
5223 I4: 6 14 22 30 7 15 23 31. */
5225 void
5226 vect_permute_store_chain (vec<tree> dr_chain,
5227 unsigned int length,
5228 gimple *stmt,
5229 gimple_stmt_iterator *gsi,
5230 vec<tree> *result_chain)
5232 tree vect1, vect2, high, low;
5233 gimple *perm_stmt;
5234 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5235 tree perm_mask_low, perm_mask_high;
5236 tree data_ref;
5237 tree perm3_mask_low, perm3_mask_high;
5238 unsigned int i, j, n, log_length = exact_log2 (length);
5240 result_chain->quick_grow (length);
5241 memcpy (result_chain->address (), dr_chain.address (),
5242 length * sizeof (tree));
5244 if (length == 3)
5246 /* vect_grouped_store_supported ensures that this is constant. */
5247 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5248 unsigned int j0 = 0, j1 = 0, j2 = 0;
5250 vec_perm_builder sel (nelt, nelt, 1);
5251 sel.quick_grow (nelt);
5252 vec_perm_indices indices;
5253 for (j = 0; j < 3; j++)
5255 int nelt0 = ((3 - j) * nelt) % 3;
5256 int nelt1 = ((3 - j) * nelt + 1) % 3;
5257 int nelt2 = ((3 - j) * nelt + 2) % 3;
5259 for (i = 0; i < nelt; i++)
5261 if (3 * i + nelt0 < nelt)
5262 sel[3 * i + nelt0] = j0++;
5263 if (3 * i + nelt1 < nelt)
5264 sel[3 * i + nelt1] = nelt + j1++;
5265 if (3 * i + nelt2 < nelt)
5266 sel[3 * i + nelt2] = 0;
5268 indices.new_vector (sel, 2, nelt);
5269 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5271 for (i = 0; i < nelt; i++)
5273 if (3 * i + nelt0 < nelt)
5274 sel[3 * i + nelt0] = 3 * i + nelt0;
5275 if (3 * i + nelt1 < nelt)
5276 sel[3 * i + nelt1] = 3 * i + nelt1;
5277 if (3 * i + nelt2 < nelt)
5278 sel[3 * i + nelt2] = nelt + j2++;
5280 indices.new_vector (sel, 2, nelt);
5281 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5283 vect1 = dr_chain[0];
5284 vect2 = dr_chain[1];
5286 /* Create interleaving stmt:
5287 low = VEC_PERM_EXPR <vect1, vect2,
5288 {j, nelt, *, j + 1, nelt + j + 1, *,
5289 j + 2, nelt + j + 2, *, ...}> */
5290 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5291 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5292 vect2, perm3_mask_low);
5293 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5295 vect1 = data_ref;
5296 vect2 = dr_chain[2];
5297 /* Create interleaving stmt:
5298 low = VEC_PERM_EXPR <vect1, vect2,
5299 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5300 6, 7, nelt + j + 2, ...}> */
5301 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5302 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5303 vect2, perm3_mask_high);
5304 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5305 (*result_chain)[j] = data_ref;
5308 else
5310 /* If length is not equal to 3 then only power of 2 is supported. */
5311 gcc_assert (pow2p_hwi (length));
5313 /* The encoding has 2 interleaved stepped patterns. */
5314 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5315 vec_perm_builder sel (nelt, 2, 3);
5316 sel.quick_grow (6);
5317 for (i = 0; i < 3; i++)
5319 sel[i * 2] = i;
5320 sel[i * 2 + 1] = i + nelt;
5322 vec_perm_indices indices (sel, 2, nelt);
5323 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5325 for (i = 0; i < 6; i++)
5326 sel[i] += exact_div (nelt, 2);
5327 indices.new_vector (sel, 2, nelt);
5328 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
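/* For example, with nelt == 8 these encodings expand to
       perm_mask_high = { 0, 8, 1, 9, 2, 10, 3, 11 }
       perm_mask_low  = { 4, 12, 5, 13, 6, 14, 7, 15 }
   i.e. the interleave_high and interleave_low selectors used in the
   example at the head of this function.  */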
5330 for (i = 0, n = log_length; i < n; i++)
5332 for (j = 0; j < length/2; j++)
5334 vect1 = dr_chain[j];
5335 vect2 = dr_chain[j+length/2];
5337 /* Create interleaving stmt:
5338 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5339 ...}> */
5340 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5341 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5342 vect2, perm_mask_high);
5343 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5344 (*result_chain)[2*j] = high;
5346 /* Create interleaving stmt:
5347 low = VEC_PERM_EXPR <vect1, vect2,
5348 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5349 ...}> */
5350 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5351 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5352 vect2, perm_mask_low);
5353 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5354 (*result_chain)[2*j+1] = low;
5356 memcpy (dr_chain.address (), result_chain->address (),
5357 length * sizeof (tree));
5362 /* Function vect_setup_realignment
5364 This function is called when vectorizing an unaligned load using
5365 the dr_explicit_realign[_optimized] scheme.
5366 This function generates the following code at the loop prolog:
5368 p = initial_addr;
5369 x msq_init = *(floor(p)); # prolog load
5370 realignment_token = call target_builtin;
5371 loop:
5372 x msq = phi (msq_init, ---)
5374 The stmts marked with x are generated only for the case of
5375 dr_explicit_realign_optimized.
5377 The code above sets up a new (vector) pointer, pointing to the first
5378 location accessed by STMT, and a "floor-aligned" load using that pointer.
5379 It also generates code to compute the "realignment-token" (if the relevant
5380 target hook was defined), and creates a phi-node at the loop-header bb
5381 whose arguments are the result of the prolog-load (created by this
5382 function) and the result of a load that takes place in the loop (to be
5383 created by the caller to this function).
5385 For the case of dr_explicit_realign_optimized:
5386 The caller to this function uses the phi-result (msq) to create the
5387 realignment code inside the loop, and sets up the missing phi argument,
5388 as follows:
5389 loop:
5390 msq = phi (msq_init, lsq)
5391 lsq = *(floor(p')); # load in loop
5392 result = realign_load (msq, lsq, realignment_token);
5394 For the case of dr_explicit_realign:
5395 loop:
5396 msq = *(floor(p)); # load in loop
5397 p' = p + (VS-1);
5398 lsq = *(floor(p')); # load in loop
5399 result = realign_load (msq, lsq, realignment_token);
5401 Input:
5402 STMT - (scalar) load stmt to be vectorized. This load accesses
5403 a memory location that may be unaligned.
5404 BSI - place where new code is to be inserted.
5405 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5406 is used.
5408 Output:
5409 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5410 target hook, if defined.
5411 Return value - the result of the loop-header phi node. */
5413 tree
5414 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5415 tree *realignment_token,
5416 enum dr_alignment_support alignment_support_scheme,
5417 tree init_addr,
5418 struct loop **at_loop)
5420 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5421 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5422 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5423 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5424 struct loop *loop = NULL;
5425 edge pe = NULL;
5426 tree scalar_dest = gimple_assign_lhs (stmt);
5427 tree vec_dest;
5428 gimple *inc;
5429 tree ptr;
5430 tree data_ref;
5431 basic_block new_bb;
5432 tree msq_init = NULL_TREE;
5433 tree new_temp;
5434 gphi *phi_stmt;
5435 tree msq = NULL_TREE;
5436 gimple_seq stmts = NULL;
5437 bool inv_p;
5438 bool compute_in_loop = false;
5439 bool nested_in_vect_loop = false;
5440 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5441 struct loop *loop_for_initial_load = NULL;
5443 if (loop_vinfo)
5445 loop = LOOP_VINFO_LOOP (loop_vinfo);
5446 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5449 gcc_assert (alignment_support_scheme == dr_explicit_realign
5450 || alignment_support_scheme == dr_explicit_realign_optimized);
5452 /* We need to generate three things:
5453 1. the misalignment computation
5454 2. the extra vector load (for the optimized realignment scheme).
5455 3. the phi node for the two vectors from which the realignment is
5456 done (for the optimized realignment scheme). */
5458 /* 1. Determine where to generate the misalignment computation.
5460 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5461 calculation will be generated by this function, outside the loop (in the
5462 preheader). Otherwise, INIT_ADDR has already been computed for us by the
5463 caller, inside the loop.
5465 Background: If the misalignment remains fixed throughout the iterations of
5466 the loop, then both realignment schemes are applicable, and also the
5467 misalignment computation can be done outside LOOP. This is because we are
5468 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5469 are a multiple of VS (the Vector Size), and therefore the misalignment in
5470 different vectorized LOOP iterations is always the same.
5471 The problem arises only if the memory access is in an inner-loop nested
5472 inside LOOP, which is now being vectorized using outer-loop vectorization.
5473 This is the only case when the misalignment of the memory access may not
5474 remain fixed throughout the iterations of the inner-loop (as explained in
5475 detail in vect_supportable_dr_alignment). In this case, not only is the
5476 optimized realignment scheme not applicable, but also the misalignment
5477 computation (and generation of the realignment token that is passed to
5478 REALIGN_LOAD) have to be done inside the loop.
5480 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5481 or not, which in turn determines if the misalignment is computed inside
5482 the inner-loop, or outside LOOP. */
5484 if (init_addr != NULL_TREE || !loop_vinfo)
5486 compute_in_loop = true;
5487 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5491 /* 2. Determine where to generate the extra vector load.
5493 For the optimized realignment scheme, instead of generating two vector
5494 loads in each iteration, we generate a single extra vector load in the
5495 preheader of the loop, and in each iteration reuse the result of the
5496 vector load from the previous iteration. In case the memory access is in
5497 an inner-loop nested inside LOOP, which is now being vectorized using
5498 outer-loop vectorization, we need to determine whether this initial vector
5499 load should be generated at the preheader of the inner-loop, or can be
5500 generated at the preheader of LOOP. If the memory access has no evolution
5501 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5502 to be generated inside LOOP (in the preheader of the inner-loop). */
5504 if (nested_in_vect_loop)
5506 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5507 bool invariant_in_outerloop =
5508 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5509 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5511 else
5512 loop_for_initial_load = loop;
5513 if (at_loop)
5514 *at_loop = loop_for_initial_load;
5516 if (loop_for_initial_load)
5517 pe = loop_preheader_edge (loop_for_initial_load);
5519 /* 3. For the case of the optimized realignment, create the first vector
5520 load at the loop preheader. */
5522 if (alignment_support_scheme == dr_explicit_realign_optimized)
5524 /* Create msq_init = *(floor(p1)) in the loop preheader */
5525 gassign *new_stmt;
5527 gcc_assert (!compute_in_loop);
5528 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5529 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5530 NULL_TREE, &init_addr, NULL, &inc,
5531 true, &inv_p);
5532 if (TREE_CODE (ptr) == SSA_NAME)
5533 new_temp = copy_ssa_name (ptr);
5534 else
5535 new_temp = make_ssa_name (TREE_TYPE (ptr));
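/* The statement built below masks off the low address bits, assuming
   DR_TARGET_ALIGNMENT is a power of two: new_temp = ptr & -align is the
   "floor (p)" address that the prolog load reads from.  */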
5536 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5537 new_stmt = gimple_build_assign
5538 (new_temp, BIT_AND_EXPR, ptr,
5539 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5540 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5541 gcc_assert (!new_bb);
5542 data_ref
5543 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5544 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5545 vect_copy_ref_info (data_ref, DR_REF (dr));
5546 new_stmt = gimple_build_assign (vec_dest, data_ref);
5547 new_temp = make_ssa_name (vec_dest, new_stmt);
5548 gimple_assign_set_lhs (new_stmt, new_temp);
5549 if (pe)
5551 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5552 gcc_assert (!new_bb);
5554 else
5555 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5557 msq_init = gimple_assign_lhs (new_stmt);
5560 /* 4. Create realignment token using a target builtin, if available.
5561 It is done either inside the containing loop, or before LOOP (as
5562 determined above). */
5564 if (targetm.vectorize.builtin_mask_for_load)
5566 gcall *new_stmt;
5567 tree builtin_decl;
5569 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5570 if (!init_addr)
5572 /* Generate the INIT_ADDR computation outside LOOP. */
5573 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5574 NULL_TREE);
5575 if (loop)
5577 pe = loop_preheader_edge (loop);
5578 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5579 gcc_assert (!new_bb);
5581 else
5582 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5585 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5586 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5587 vec_dest =
5588 vect_create_destination_var (scalar_dest,
5589 gimple_call_return_type (new_stmt));
5590 new_temp = make_ssa_name (vec_dest, new_stmt);
5591 gimple_call_set_lhs (new_stmt, new_temp);
5593 if (compute_in_loop)
5594 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5595 else
5597 /* Generate the misalignment computation outside LOOP. */
5598 pe = loop_preheader_edge (loop);
5599 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5600 gcc_assert (!new_bb);
5603 *realignment_token = gimple_call_lhs (new_stmt);
5605 /* The result of the CALL_EXPR to this builtin is determined from
5606 the value of the parameter and no global variables are touched
5607 which makes the builtin a "const" function. Requiring the
5608 builtin to have the "const" attribute makes it unnecessary
5609 to call mark_call_clobbered. */
5610 gcc_assert (TREE_READONLY (builtin_decl));
5613 if (alignment_support_scheme == dr_explicit_realign)
5614 return msq;
5616 gcc_assert (!compute_in_loop);
5617 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5620 /* 5. Create msq = phi <msq_init, lsq> in loop */
5622 pe = loop_preheader_edge (containing_loop);
5623 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5624 msq = make_ssa_name (vec_dest);
5625 phi_stmt = create_phi_node (msq, containing_loop->header);
5626 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
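/* The second phi argument (the LSQ produced inside the loop) is filled in
   later by the caller, as described in the function comment above.  */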
5628 return msq;
5632 /* Function vect_grouped_load_supported.
5634 COUNT is the size of the load group (the number of statements plus the
5635 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5636 only one statement, with a gap of COUNT - 1.
5638 Returns true if a suitable permute exists. */
5640 bool
5641 vect_grouped_load_supported (tree vectype, bool single_element_p,
5642 unsigned HOST_WIDE_INT count)
5644 machine_mode mode = TYPE_MODE (vectype);
5646 /* If this is single-element interleaving with an element distance
5647 that leaves unused vector loads around, punt - we would at least create
5648 very sub-optimal code in that case (and blow up memory,
5649 see PR65518). */
5650 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5652 if (dump_enabled_p ())
5653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5654 "single-element interleaving not supported "
5655 "for not adjacent vector loads\n");
5656 return false;
5659 /* vect_permute_load_chain requires the group size to be equal to 3 or
5660 be a power of two. */
5661 if (count != 3 && exact_log2 (count) == -1)
5663 if (dump_enabled_p ())
5664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5665 "the size of the group of accesses"
5666 " is not a power of 2 or not equal to 3\n");
5667 return false;
5670 /* Check that the permutation is supported. */
5671 if (VECTOR_MODE_P (mode))
5673 unsigned int i, j;
5674 if (count == 3)
5676 unsigned int nelt;
5677 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5679 if (dump_enabled_p ())
5680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5681 "cannot handle groups of 3 loads for"
5682 " variable-length vectors\n");
5683 return false;
5686 vec_perm_builder sel (nelt, nelt, 1);
5687 sel.quick_grow (nelt);
5688 vec_perm_indices indices;
5689 unsigned int k;
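/* These are the same selectors that vect_permute_load_chain builds for a
   group of three loads (its perm3_mask_low and perm3_mask_high); check
   that the target can perform both permutations for every K.  */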
5690 for (k = 0; k < 3; k++)
5692 for (i = 0; i < nelt; i++)
5693 if (3 * i + k < 2 * nelt)
5694 sel[i] = 3 * i + k;
5695 else
5696 sel[i] = 0;
5697 indices.new_vector (sel, 2, nelt);
5698 if (!can_vec_perm_const_p (mode, indices))
5700 if (dump_enabled_p ())
5701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5702 "shuffle of 3 loads is not supported by"
5703 " target\n");
5704 return false;
5706 for (i = 0, j = 0; i < nelt; i++)
5707 if (3 * i + k < 2 * nelt)
5708 sel[i] = i;
5709 else
5710 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5711 indices.new_vector (sel, 2, nelt);
5712 if (!can_vec_perm_const_p (mode, indices))
5714 if (dump_enabled_p ())
5715 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5716 "shuffle of 3 loads is not supported by"
5717 " target\n");
5718 return false;
5721 return true;
5723 else
5725 /* If COUNT is not equal to 3 then only a power of 2 is supported. */
5726 gcc_assert (pow2p_hwi (count));
5727 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5729 /* The encoding has a single stepped pattern. */
5730 vec_perm_builder sel (nelt, 1, 3);
5731 sel.quick_grow (3);
5732 for (i = 0; i < 3; i++)
5733 sel[i] = i * 2;
5734 vec_perm_indices indices (sel, 2, nelt);
5735 if (can_vec_perm_const_p (mode, indices))
5737 for (i = 0; i < 3; i++)
5738 sel[i] = i * 2 + 1;
5739 indices.new_vector (sel, 2, nelt);
5740 if (can_vec_perm_const_p (mode, indices))
5741 return true;
5746 if (dump_enabled_p ())
5747 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5748 "extract even/odd not supported by target\n");
5749 return false;
5752 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5753 type VECTYPE. MASKED_P says whether the masked form is needed. */
5755 bool
5756 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5757 bool masked_p)
5759 if (masked_p)
5760 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5761 vec_mask_load_lanes_optab,
5762 vectype, count);
5763 else
5764 return vect_lanes_optab_supported_p ("vec_load_lanes",
5765 vec_load_lanes_optab,
5766 vectype, count);
5769 /* Function vect_permute_load_chain.
5771 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5772 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5773 the input data correctly. Return the final references for loads in
5774 RESULT_CHAIN.
5776 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5777 The input is 4 vectors each containing 8 elements. We assign a number to each
5778 element; the input sequence is:
5780 1st vec: 0 1 2 3 4 5 6 7
5781 2nd vec: 8 9 10 11 12 13 14 15
5782 3rd vec: 16 17 18 19 20 21 22 23
5783 4th vec: 24 25 26 27 28 29 30 31
5785 The output sequence should be:
5787 1st vec: 0 4 8 12 16 20 24 28
5788 2nd vec: 1 5 9 13 17 21 25 29
5789 3rd vec: 2 6 10 14 18 22 26 30
5790 4th vec: 3 7 11 15 19 23 27 31
5792 i.e., the first output vector should contain the first elements of each
5793 interleaving group, etc.
5795 We use extract_even/odd instructions to create such output. The input of
5796 each extract_even/odd operation is two vectors
5797 1st vec 2nd vec
5798 0 1 2 3 4 5 6 7
5800 and the output is the vector of extracted even/odd elements. The output of
5801 extract_even will be: 0 2 4 6
5802 and of extract_odd: 1 3 5 7
5805 The permutation is done in log LENGTH stages. In each stage extract_even
5806 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5807 their order. In our example,
5809 E1: extract_even (1st vec, 2nd vec)
5810 E2: extract_odd (1st vec, 2nd vec)
5811 E3: extract_even (3rd vec, 4th vec)
5812 E4: extract_odd (3rd vec, 4th vec)
5814 The output for the first stage will be:
5816 E1: 0 2 4 6 8 10 12 14
5817 E2: 1 3 5 7 9 11 13 15
5818 E3: 16 18 20 22 24 26 28 30
5819 E4: 17 19 21 23 25 27 29 31
5821 In order to proceed and create the correct sequence for the next stage (or
5822 for the correct output, if the second stage is the last one, as in our
5823 example), we first put the output of the extract_even operation and then the
5824 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5825 The input for the second stage is:
5827 1st vec (E1): 0 2 4 6 8 10 12 14
5828 2nd vec (E3): 16 18 20 22 24 26 28 30
5829 3rd vec (E2): 1 3 5 7 9 11 13 15
5830 4th vec (E4): 17 19 21 23 25 27 29 31
5832 The output of the second stage:
5834 E1: 0 4 8 12 16 20 24 28
5835 E2: 2 6 10 14 18 22 26 30
5836 E3: 1 5 9 13 17 21 25 29
5837 E4: 3 7 11 15 19 23 27 31
5839 And RESULT_CHAIN after reordering:
5841 1st vec (E1): 0 4 8 12 16 20 24 28
5842 2nd vec (E3): 1 5 9 13 17 21 25 29
5843 3rd vec (E2): 2 6 10 14 18 22 26 30
5844 4th vec (E4): 3 7 11 15 19 23 27 31. */
5846 static void
5847 vect_permute_load_chain (vec<tree> dr_chain,
5848 unsigned int length,
5849 gimple *stmt,
5850 gimple_stmt_iterator *gsi,
5851 vec<tree> *result_chain)
5853 tree data_ref, first_vect, second_vect;
5854 tree perm_mask_even, perm_mask_odd;
5855 tree perm3_mask_low, perm3_mask_high;
5856 gimple *perm_stmt;
5857 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5858 unsigned int i, j, log_length = exact_log2 (length);
5860 result_chain->quick_grow (length);
5861 memcpy (result_chain->address (), dr_chain.address (),
5862 length * sizeof (tree));
5864 if (length == 3)
5866 /* vect_grouped_load_supported ensures that this is constant. */
5867 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5868 unsigned int k;
5870 vec_perm_builder sel (nelt, nelt, 1);
5871 sel.quick_grow (nelt);
5872 vec_perm_indices indices;
5873 for (k = 0; k < 3; k++)
5875 for (i = 0; i < nelt; i++)
5876 if (3 * i + k < 2 * nelt)
5877 sel[i] = 3 * i + k;
5878 else
5879 sel[i] = 0;
5880 indices.new_vector (sel, 2, nelt);
5881 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5883 for (i = 0, j = 0; i < nelt; i++)
5884 if (3 * i + k < 2 * nelt)
5885 sel[i] = i;
5886 else
5887 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5888 indices.new_vector (sel, 2, nelt);
5889 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5891 first_vect = dr_chain[0];
5892 second_vect = dr_chain[1];
5894 /* Create interleaving stmt (low part of):
5895 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5896 ...}> */
5897 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5898 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5899 second_vect, perm3_mask_low);
5900 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5902 /* Create interleaving stmt (high part of):
5903 high = VEC_PERM_EXPR <first_vect, second_vect, {0, 1, 2, ...,
5904 nelt + ((nelt + k) % 3), nelt + ((nelt + k) % 3) + 3, ...}> */
5905 first_vect = data_ref;
5906 second_vect = dr_chain[2];
5907 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5908 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5909 second_vect, perm3_mask_high);
5910 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5911 (*result_chain)[k] = data_ref;
5914 else
5916 /* If length is not equal to 3 then only power of 2 is supported. */
5917 gcc_assert (pow2p_hwi (length));
5919 /* The encoding has a single stepped pattern. */
5920 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5921 vec_perm_builder sel (nelt, 1, 3);
5922 sel.quick_grow (3);
5923 for (i = 0; i < 3; ++i)
5924 sel[i] = i * 2;
5925 vec_perm_indices indices (sel, 2, nelt);
5926 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5928 for (i = 0; i < 3; ++i)
5929 sel[i] = i * 2 + 1;
5930 indices.new_vector (sel, 2, nelt);
5931 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
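/* For example, with nelt == 8 these single-pattern encodings expand to
       perm_mask_even = { 0, 2, 4, 6, 8, 10, 12, 14 }
       perm_mask_odd  = { 1, 3, 5, 7, 9, 11, 13, 15 }
   i.e. the extract_even and extract_odd selectors from the example in
   the comment above.  */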
5933 for (i = 0; i < log_length; i++)
5935 for (j = 0; j < length; j += 2)
5937 first_vect = dr_chain[j];
5938 second_vect = dr_chain[j+1];
5940 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5941 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5942 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5943 first_vect, second_vect,
5944 perm_mask_even);
5945 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5946 (*result_chain)[j/2] = data_ref;
5948 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5949 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5950 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5951 first_vect, second_vect,
5952 perm_mask_odd);
5953 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5954 (*result_chain)[j/2+length/2] = data_ref;
5956 memcpy (dr_chain.address (), result_chain->address (),
5957 length * sizeof (tree));
5962 /* Function vect_shift_permute_load_chain.
5964 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5965 sequence of stmts to reorder the input data accordingly.
5966 Return the final references for loads in RESULT_CHAIN.
5967 Return true if successful, false otherwise.
5969 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5970 The input is 3 vectors each containing 8 elements. We assign a
5971 number to each element; the input sequence is:
5973 1st vec: 0 1 2 3 4 5 6 7
5974 2nd vec: 8 9 10 11 12 13 14 15
5975 3rd vec: 16 17 18 19 20 21 22 23
5977 The output sequence should be:
5979 1st vec: 0 3 6 9 12 15 18 21
5980 2nd vec: 1 4 7 10 13 16 19 22
5981 3rd vec: 2 5 8 11 14 17 20 23
5983 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5985 First we shuffle all 3 vectors to get the correct element order:
5987 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5988 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5989 3rd vec: (16 19 22) (17 20 23) (18 21)
5991 Next we unite and shift vector 3 times:
5993 1st step:
5994 shift right by 6 the concatenation of:
5995 "1st vec" and "2nd vec"
5996 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5997 "2nd vec" and "3rd vec"
5998 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5999 "3rd vec" and "1st vec"
6000 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6001 | New vectors |
6003 So that now new vectors are:
6005 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6006 2nd vec: (10 13) (16 19 22) (17 20 23)
6007 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6009 2nd step:
6010 shift right by 5 the concatenation of:
6011 "1st vec" and "3rd vec"
6012 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6013 "2nd vec" and "1st vec"
6014 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6015 "3rd vec" and "2nd vec"
6016 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6017 | New vectors |
6019 So that now new vectors are:
6021 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6022 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6023 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6025 3rd step:
6026 shift right by 5 the concatenation of:
6027 "1st vec" and "1st vec"
6028 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6029 shift right by 3 the concatenation of:
6030 "2nd vec" and "2nd vec"
6031 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6032 | New vectors |
6034 So that now all vectors are READY:
6035 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6036 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6037 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6039 This algorithm is faster than one in vect_permute_load_chain if:
6040 1. "shift of a concatination" is faster than general permutation.
6041 This is usually so.
6042 2. The TARGET machine can't execute vector instructions in parallel.
6043 This is because each step of the algorithm depends on the previous one.
6044 The algorithm in vect_permute_load_chain is much more parallel.
6046 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6049 static bool
6050 vect_shift_permute_load_chain (vec<tree> dr_chain,
6051 unsigned int length,
6052 gimple *stmt,
6053 gimple_stmt_iterator *gsi,
6054 vec<tree> *result_chain)
6056 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6057 tree perm2_mask1, perm2_mask2, perm3_mask;
6058 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6059 gimple *perm_stmt;
6061 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6062 unsigned int i;
6063 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6064 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6066 unsigned HOST_WIDE_INT nelt, vf;
6067 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6068 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6069 /* Not supported for variable-length vectors. */
6070 return false;
6072 vec_perm_builder sel (nelt, nelt, 1);
6073 sel.quick_grow (nelt);
6075 result_chain->quick_grow (length);
6076 memcpy (result_chain->address (), dr_chain.address (),
6077 length * sizeof (tree));
6079 if (pow2p_hwi (length) && vf > 4)
6081 unsigned int j, log_length = exact_log2 (length);
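/* Generating permutation constant to split one vector into its even
   elements (low half of the result) and odd elements (high half).
   For vector length 8 it is {0 2 4 6 1 3 5 7}.  */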
6082 for (i = 0; i < nelt / 2; ++i)
6083 sel[i] = i * 2;
6084 for (i = 0; i < nelt / 2; ++i)
6085 sel[nelt / 2 + i] = i * 2 + 1;
6086 vec_perm_indices indices (sel, 2, nelt);
6087 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6089 if (dump_enabled_p ())
6090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6091 "shuffle of 2 fields structure is not \
6092 supported by target\n");
6093 return false;
6095 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
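/* The same split with the halves swapped: odd elements first.
   For vector length 8 it is {1 3 5 7 0 2 4 6}.  */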
6097 for (i = 0; i < nelt / 2; ++i)
6098 sel[i] = i * 2 + 1;
6099 for (i = 0; i < nelt / 2; ++i)
6100 sel[nelt / 2 + i] = i * 2;
6101 indices.new_vector (sel, 2, nelt);
6102 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6104 if (dump_enabled_p ())
6105 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6106 "shuffle of 2 fields structure is not \
6107 supported by target\n");
6108 return false;
6110 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6112 /* Generating permutation constant to shift all elements.
6113 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6114 for (i = 0; i < nelt; i++)
6115 sel[i] = nelt / 2 + i;
6116 indices.new_vector (sel, 2, nelt);
6117 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6119 if (dump_enabled_p ())
6120 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6121 "shift permutation is not supported by target\n");
6122 return false;
6124 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6126 /* Generating permutation constant to select vector from 2.
6127 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6128 for (i = 0; i < nelt / 2; i++)
6129 sel[i] = i;
6130 for (i = nelt / 2; i < nelt; i++)
6131 sel[i] = nelt + i;
6132 indices.new_vector (sel, 2, nelt);
6133 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6135 if (dump_enabled_p ())
6136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6137 "select is not supported by target\n");
6138 return false;
6140 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6142 for (i = 0; i < log_length; i++)
6144 for (j = 0; j < length; j += 2)
6146 first_vect = dr_chain[j];
6147 second_vect = dr_chain[j + 1];
6149 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6150 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6151 first_vect, first_vect,
6152 perm2_mask1);
6153 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6154 vect[0] = data_ref;
6156 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6157 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6158 second_vect, second_vect,
6159 perm2_mask2);
6160 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6161 vect[1] = data_ref;
6163 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6164 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6165 vect[0], vect[1], shift1_mask);
6166 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6167 (*result_chain)[j/2 + length/2] = data_ref;
6169 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6170 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6171 vect[0], vect[1], select_mask);
6172 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6173 (*result_chain)[j/2] = data_ref;
6175 memcpy (dr_chain.address (), result_chain->address (),
6176 length * sizeof (tree));
6178 return true;
6180 if (length == 3 && vf > 2)
6182 unsigned int k = 0, l = 0;
6184 /* Generating permutation constant to get all elements in the right order.
6185 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6186 for (i = 0; i < nelt; i++)
6188 if (3 * k + (l % 3) >= nelt)
6190 k = 0;
6191 l += (3 - (nelt % 3));
6193 sel[i] = 3 * k + (l % 3);
6194 k++;
6196 vec_perm_indices indices (sel, 2, nelt);
6197 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6199 if (dump_enabled_p ())
6200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6201 "shuffle of 3 fields structure is not \
6202 supported by target\n");
6203 return false;
6205 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6207 /* Generating permutation constant to shift all elements.
6208 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6209 for (i = 0; i < nelt; i++)
6210 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6211 indices.new_vector (sel, 2, nelt);
6212 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6214 if (dump_enabled_p ())
6215 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6216 "shift permutation is not supported by target\n");
6217 return false;
6219 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6221 /* Generating permutation constant to shift all elements.
6222 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6223 for (i = 0; i < nelt; i++)
6224 sel[i] = 2 * (nelt / 3) + 1 + i;
6225 indices.new_vector (sel, 2, nelt);
6226 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6228 if (dump_enabled_p ())
6229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6230 "shift permutation is not supported by target\n");
6231 return false;
6233 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6235 /* Generating permutation constant to shift all elements.
6236 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6237 for (i = 0; i < nelt; i++)
6238 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6239 indices.new_vector (sel, 2, nelt);
6240 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6242 if (dump_enabled_p ())
6243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6244 "shift permutation is not supported by target\n");
6245 return false;
6247 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6249 /* Generating permutation constant to shift all elements.
6250 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6251 for (i = 0; i < nelt; i++)
6252 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6253 indices.new_vector (sel, 2, nelt);
6254 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6256 if (dump_enabled_p ())
6257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6258 "shift permutation is not supported by target\n");
6259 return false;
6261 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6263 for (k = 0; k < 3; k++)
6265 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6266 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6267 dr_chain[k], dr_chain[k],
6268 perm3_mask);
6269 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6270 vect[k] = data_ref;
6273 for (k = 0; k < 3; k++)
6275 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6276 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6277 vect[k % 3], vect[(k + 1) % 3],
6278 shift1_mask);
6279 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6280 vect_shift[k] = data_ref;
6283 for (k = 0; k < 3; k++)
6285 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6286 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6287 vect_shift[(4 - k) % 3],
6288 vect_shift[(3 - k) % 3],
6289 shift2_mask);
6290 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6291 vect[k] = data_ref;
6294 (*result_chain)[3 - (nelt % 3)] = vect[2];
6296 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6297 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6298 vect[0], shift3_mask);
6299 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6300 (*result_chain)[nelt % 3] = data_ref;
6302 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6303 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6304 vect[1], shift4_mask);
6305 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6306 (*result_chain)[0] = data_ref;
6307 return true;
6309 return false;
6312 /* Function vect_transform_grouped_load.
6314 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6315 to perform their permutation and ascribe the resulting vectorized statements to
6316 the scalar statements.
6319 void
6320 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6321 gimple_stmt_iterator *gsi)
6323 machine_mode mode;
6324 vec<tree> result_chain = vNULL;
6326 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6327 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6328 vectors, that are ready for vector computation. */
6329 result_chain.create (size);
6331 /* If the reassociation width for the vector type is 2 or greater, the target
6332 machine can execute 2 or more vector instructions in parallel. Otherwise
6333 try to get the chain for the load group using vect_shift_permute_load_chain.
6334 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6335 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6336 || pow2p_hwi (size)
6337 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6338 gsi, &result_chain))
6339 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6340 vect_record_grouped_load_vectors (stmt, result_chain);
6341 result_chain.release ();
6344 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6345 generated as part of the vectorization of STMT. Assign the statement
6346 for each vector to the associated scalar statement. */
6348 void
6349 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6351 gimple *first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6352 gimple *next_stmt, *new_stmt;
6353 unsigned int i, gap_count;
6354 tree tmp_data_ref;
6356 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6357 Since we scan the chain starting from its first node, their order
6358 corresponds to the order of data-refs in RESULT_CHAIN.
6359 next_stmt = first_stmt;
6360 gap_count = 1;
6361 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6363 if (!next_stmt)
6364 break;
6366 /* Skip the gaps. Loads created for the gaps will be removed by dead
6367 code elimination pass later. No need to check for the first stmt in
6368 the group, since it always exists.
6369 DR_GROUP_GAP is the number of steps in elements from the previous
6370 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6371 correspond to the gaps. */
6372 if (next_stmt != first_stmt
6373 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
6375 gap_count++;
6376 continue;
6379 while (next_stmt)
6381 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6382 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6383 copies, and we put the new vector statement in the first available
6384 RELATED_STMT. */
6385 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6386 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6387 else
6389 if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6391 gimple *prev_stmt =
6392 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6393 gimple *rel_stmt =
6394 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6395 while (rel_stmt)
6397 prev_stmt = rel_stmt;
6398 rel_stmt =
6399 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6402 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6403 new_stmt;
6407 next_stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6408 gap_count = 1;
6409 /* If NEXT_STMT accesses the same DR as the previous statement,
6410 put the same TMP_DATA_REF as its vectorized statement; otherwise
6411 get the next data-ref from RESULT_CHAIN. */
6412 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6413 break;
6418 /* Function vect_can_force_dr_alignment_p.
6420 Returns whether the alignment of a DECL can be forced to be aligned
6421 on an ALIGNMENT-bit boundary. */
6423 bool
6424 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6426 if (!VAR_P (decl))
6427 return false;
6429 if (decl_in_symtab_p (decl)
6430 && !symtab_node::get (decl)->can_increase_alignment_p ())
6431 return false;
6433 if (TREE_STATIC (decl))
6434 return (alignment <= MAX_OFILE_ALIGNMENT);
6435 else
6436 return (alignment <= MAX_STACK_ALIGNMENT);
6440 /* Return whether the data reference DR is supported with respect to its
6441 alignment.
6442 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6443 if it is aligned, i.e., check if it is possible to vectorize it with different
6444 alignment. */
6446 enum dr_alignment_support
6447 vect_supportable_dr_alignment (struct data_reference *dr,
6448 bool check_aligned_accesses)
6450 gimple *stmt = vect_dr_stmt (dr);
6451 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6452 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6453 machine_mode mode = TYPE_MODE (vectype);
6454 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6455 struct loop *vect_loop = NULL;
6456 bool nested_in_vect_loop = false;
6458 if (aligned_access_p (dr) && !check_aligned_accesses)
6459 return dr_aligned;
6461 /* For now assume all conditional loads/stores support unaligned
6462 access without any special code. */
6463 if (is_gimple_call (stmt)
6464 && gimple_call_internal_p (stmt)
6465 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6466 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6467 return dr_unaligned_supported;
6469 if (loop_vinfo)
6471 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6472 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6475 /* Possibly unaligned access. */
6477 /* We can choose between using the implicit realignment scheme (generating
6478 a misaligned_move stmt) and the explicit realignment scheme (generating
6479 aligned loads with a REALIGN_LOAD). There are two variants to the
6480 explicit realignment scheme: optimized, and unoptimized.
6481 We can optimize the realignment only if the step between consecutive
6482 vector loads is equal to the vector size. Since the vector memory
6483 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6484 is guaranteed that the misalignment amount remains the same throughout the
6485 execution of the vectorized loop. Therefore, we can create the
6486 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6487 at the loop preheader.
6489 However, in the case of outer-loop vectorization, when vectorizing a
6490 memory access in the inner-loop nested within the LOOP that is now being
6491 vectorized, while it is guaranteed that the misalignment of the
6492 vectorized memory access will remain the same in different outer-loop
6494 iterations, it is *not* guaranteed that it will remain the same throughout
6494 the execution of the inner-loop. This is because the inner-loop advances
6495 with the original scalar step (and not in steps of VS). If the inner-loop
6496 step happens to be a multiple of VS, then the misalignment remains fixed
6497 and we can use the optimized realignment scheme. For example:
6499 for (i=0; i<N; i++)
6500 for (j=0; j<M; j++)
6501 s += a[i+j];
6503 When vectorizing the i-loop in the above example, the step between
6504 consecutive vector loads is 1, and so the misalignment does not remain
6505 fixed across the execution of the inner-loop, and the realignment cannot
6506 be optimized (as illustrated in the following pseudo vectorized loop):
6508 for (i=0; i<N; i+=4)
6509 for (j=0; j<M; j++){
6510 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6511 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6512 // (assuming that we start from an aligned address).
6515 We therefore have to use the unoptimized realignment scheme:
6517 for (i=0; i<N; i+=4)
6518 for (j=k; j<M; j+=4)
6519 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6520 // that the misalignment of the initial address is
6521 // 0).
6523 The loop can then be vectorized as follows:
6525 for (k=0; k<4; k++){
6526 rt = get_realignment_token (&vp[k]);
6527 for (i=0; i<N; i+=4){
6528 v1 = vp[i+k];
6529 for (j=k; j<M; j+=4){
6530 v2 = vp[i+j+VS-1];
6531 va = REALIGN_LOAD <v1,v2,rt>;
6532 vs += va;
6533 v1 = v2;
6536 } */
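/* For loads, prefer the explicit realignment schemes when the target
   provides vec_realign_load (and, if it defines the hook at all, a
   mask_for_load builtin); otherwise fall back to plain misaligned vector
   loads if the target supports them.  Stores only have the misaligned
   access fallback.  */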
6538 if (DR_IS_READ (dr))
6540 bool is_packed = false;
6541 tree type = (TREE_TYPE (DR_REF (dr)));
6543 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6544 && (!targetm.vectorize.builtin_mask_for_load
6545 || targetm.vectorize.builtin_mask_for_load ()))
6547 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6549 /* If we are doing SLP then the accesses need not have the
6550 same alignment, instead it depends on the SLP group size. */
6551 if (loop_vinfo
6552 && STMT_SLP_TYPE (stmt_info)
6553 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6554 * DR_GROUP_SIZE (vinfo_for_stmt
6555 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6556 TYPE_VECTOR_SUBPARTS (vectype)))
6558 else if (!loop_vinfo
6559 || (nested_in_vect_loop
6560 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6561 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6562 return dr_explicit_realign;
6563 else
6564 return dr_explicit_realign_optimized;
6566 if (!known_alignment_for_access_p (dr))
6567 is_packed = not_size_aligned (DR_REF (dr));
6569 if (targetm.vectorize.support_vector_misalignment
6570 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6571 /* Can't software pipeline the loads, but can at least do them. */
6572 return dr_unaligned_supported;
6574 else
6576 bool is_packed = false;
6577 tree type = (TREE_TYPE (DR_REF (dr)));
6579 if (!known_alignment_for_access_p (dr))
6580 is_packed = not_size_aligned (DR_REF (dr));
6582 if (targetm.vectorize.support_vector_misalignment
6583 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6584 return dr_unaligned_supported;
6587 /* Unsupported. */
6588 return dr_unaligned_unsupported;