gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
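/* For instance, on AArch64 a group of COUNT == 3 vectors of type V4SI
   can be loaded and stored with the LD3/ST3 instructions, which the
   target exposes through the vec_load_lanes/vec_store_lanes optabs;
   this is exactly the kind of support checked for here.  */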
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
102 /* Return the smallest scalar part of STMT.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
 109 promotion operation, we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
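/* For example, in a loop containing only
     char_x = ...;
     int_y = (int) char_x;
   the smallest scalar type is 'char', and it is the one-byte element
   size, not the four-byte size of the result, that must determine the
   vectorization factor.  */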
119 tree
120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 if (is_gimple_assign (stmt)
134 && (gimple_assign_cast_p (stmt)
135 || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
148 *lhs_size_unit = lhs;
149 *rhs_size_unit = rhs;
150 return scalar_type;
154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
155 tested at run-time. Return TRUE if DDR was successfully inserted.
156 Return false if versioning is not supported. */
158 static bool
159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
161 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
163 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
164 return false;
166 if (!runtime_alias_check_p (ddr, loop,
167 optimize_loop_nest_for_speed_p (loop)))
168 return false;
170 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
171 return true;
174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
176 static void
177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
179 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
180 for (unsigned int i = 0; i < checks.length(); ++i)
181 if (checks[i] == value)
182 return;
184 if (dump_enabled_p ())
186 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
187 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
188 dump_printf (MSG_NOTE, " is nonzero\n");
190 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
193 /* Return true if we know that the order of vectorized STMT_A and
194 vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
195 At least one of the statements is a write. */
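/* For example, if a grouped store
     a[i] = ...;
   is followed by a load
     ... = a[i];
   sinking the store to the position of the last scalar store in its
   group may move it past the load, so the scalar order is not
   guaranteed.  With the load first the order is safe: the load can
   only be hoisted to the first load of its group and the store can
   only be sunk, never the other way around.  */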
197 static bool
198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
200 stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
201 stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
203 /* Single statements are always kept in their original order. */
204 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
205 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
206 return true;
208 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
209 group are emitted at the position of the first scalar load and all
210 stores in a group are emitted at the position of the last scalar store.
211 Thus writes will happen no earlier than their current position
212 (but could happen later) while reads will happen no later than their
213 current position (but could happen earlier). Reordering is therefore
214 only possible if the first access is a write. */
215 if (is_pattern_stmt_p (stmtinfo_a))
216 stmt_a = STMT_VINFO_RELATED_STMT (stmtinfo_a);
217 if (is_pattern_stmt_p (stmtinfo_b))
218 stmt_b = STMT_VINFO_RELATED_STMT (stmtinfo_b);
219 gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
220 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
223 /* A subroutine of vect_analyze_data_ref_dependence. Handle
224 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
225 distances. These distances are conservatively correct but they don't
226 reflect a guaranteed dependence.
228 Return true if this function does all the work necessary to avoid
229 an alias or false if the caller should use the dependence distances
230 to limit the vectorization factor in the usual way. LOOP_DEPTH is
231 the depth of the loop described by LOOP_VINFO and the other arguments
232 are as for vect_analyze_data_ref_dependence. */
234 static bool
235 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
236 loop_vec_info loop_vinfo,
237 int loop_depth, unsigned int *max_vf)
239 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
240 lambda_vector dist_v;
241 unsigned int i;
242 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
244 int dist = dist_v[loop_depth];
245 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
247 /* If the user asserted safelen >= DIST consecutive iterations
248 can be executed concurrently, assume independence.
250 ??? An alternative would be to add the alias check even
251 in this case, and vectorize the fallback loop with the
252 maximum VF set to safelen. However, if the user has
253 explicitly given a length, it's less likely that that
254 would be a win. */
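	  /* E.g. '#pragma omp simd safelen(8)' lets a known dependence
	     distance of up to 8 be ignored here, provided *MAX_VF is
	     capped at 8.  */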
255 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
257 if ((unsigned int) loop->safelen < *max_vf)
258 *max_vf = loop->safelen;
259 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
260 continue;
263 /* For dependence distances of 2 or more, we have the option
264 of limiting VF or checking for an alias at runtime.
265 Prefer to check at runtime if we can, to avoid limiting
266 the VF unnecessarily when the bases are in fact independent.
268 Note that the alias checks will be removed if the VF ends up
269 being small enough. */
270 return (!STMT_VINFO_GATHER_SCATTER_P
271 (vinfo_for_stmt (DR_STMT (DDR_A (ddr))))
272 && !STMT_VINFO_GATHER_SCATTER_P
273 (vinfo_for_stmt (DR_STMT (DDR_B (ddr))))
274 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
277 return true;
281 /* Function vect_analyze_data_ref_dependence.
283 Return TRUE if there (might) exist a dependence between a memory-reference
284 DRA and a memory-reference DRB. When versioning for alias may check a
285 dependence at run-time, return FALSE. Adjust *MAX_VF according to
286 the data dependence. */
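/* For example, in
     for (i = 0; i < n; i++)
       a[i + 4] = a[i] + 1;
   the dependence distance between the read and the write is 4, so at
   most four iterations can be executed at once and *MAX_VF is limited
   to 4.  */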
288 static bool
289 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
290 loop_vec_info loop_vinfo,
291 unsigned int *max_vf)
293 unsigned int i;
294 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
295 struct data_reference *dra = DDR_A (ddr);
296 struct data_reference *drb = DDR_B (ddr);
297 stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
298 stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
299 lambda_vector dist_v;
300 unsigned int loop_depth;
302 /* In loop analysis all data references should be vectorizable. */
303 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
304 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
305 gcc_unreachable ();
307 /* Independent data accesses. */
308 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
309 return false;
311 if (dra == drb
312 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
313 return false;
315 /* We do not have to consider dependences between accesses that belong
316 to the same group, unless the stride could be smaller than the
317 group size. */
318 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
319 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
320 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
321 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
322 return false;
324 /* Even if we have an anti-dependence then, as the vectorized loop covers at
325 least two scalar iterations, there is always also a true dependence.
326 As the vectorizer does not re-order loads and stores we can ignore
327 the anti-dependence if TBAA can disambiguate both DRs similar to the
328 case with known negative distance anti-dependences (positive
329 distance anti-dependences would violate TBAA constraints). */
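  /* E.g. under TBAA a read from a 'float' array and a write through an
     'int *' pointer cannot reference the same memory in a valid
     program, so such an anti-dependence is discarded here.  */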
330 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
331 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
332 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
333 get_alias_set (DR_REF (drb))))
334 return false;
336 /* Unknown data dependence. */
337 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
339 /* If user asserted safelen consecutive iterations can be
340 executed concurrently, assume independence. */
341 if (loop->safelen >= 2)
343 if ((unsigned int) loop->safelen < *max_vf)
344 *max_vf = loop->safelen;
345 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
346 return false;
349 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
350 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
352 if (dump_enabled_p ())
354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
355 "versioning for alias not supported for: "
356 "can't determine dependence between ");
357 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
358 DR_REF (dra));
359 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
360 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
361 DR_REF (drb));
362 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
364 return true;
367 if (dump_enabled_p ())
369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
370 "versioning for alias required: "
371 "can't determine dependence between ");
372 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
373 DR_REF (dra));
374 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
375 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
376 DR_REF (drb));
377 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
380 /* Add to list of ddrs that need to be tested at run-time. */
381 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
384 /* Known data dependence. */
385 if (DDR_NUM_DIST_VECTS (ddr) == 0)
387 /* If user asserted safelen consecutive iterations can be
388 executed concurrently, assume independence. */
389 if (loop->safelen >= 2)
391 if ((unsigned int) loop->safelen < *max_vf)
392 *max_vf = loop->safelen;
393 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
394 return false;
397 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
398 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
400 if (dump_enabled_p ())
402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
403 "versioning for alias not supported for: "
404 "bad dist vector for ");
405 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
406 DR_REF (dra));
407 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
408 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
409 DR_REF (drb));
410 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
412 return true;
415 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "versioning for alias required: "
419 "bad dist vector for ");
420 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
421 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
422 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
423 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
425 /* Add to list of ddrs that need to be tested at run-time. */
426 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
429 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
431 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
432 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
433 loop_depth, max_vf))
434 return false;
436 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
438 int dist = dist_v[loop_depth];
440 if (dump_enabled_p ())
441 dump_printf_loc (MSG_NOTE, vect_location,
442 "dependence distance = %d.\n", dist);
444 if (dist == 0)
446 if (dump_enabled_p ())
448 dump_printf_loc (MSG_NOTE, vect_location,
449 "dependence distance == 0 between ");
450 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
451 dump_printf (MSG_NOTE, " and ");
452 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 453 dump_printf (MSG_NOTE, "\n");
456 /* When we perform grouped accesses and perform implicit CSE
457 by detecting equal accesses and doing disambiguation with
458 runtime alias tests like for
459 .. = a[i];
460 .. = a[i+1];
461 a[i] = ..;
462 a[i+1] = ..;
463 *p = ..;
464 .. = a[i];
465 .. = a[i+1];
466 where we will end up loading { a[i], a[i+1] } once, make
467 sure that inserting group loads before the first load and
468 stores after the last store will do the right thing.
469 Similar for groups like
470 a[i] = ...;
471 ... = a[i];
472 a[i+1] = ...;
473 where loads from the group interleave with the store. */
474 if (!vect_preserves_scalar_order_p (vect_dr_stmt(dra),
475 vect_dr_stmt (drb)))
477 if (dump_enabled_p ())
478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
479 "READ_WRITE dependence in interleaving.\n");
480 return true;
483 if (loop->safelen < 2)
485 tree indicator = dr_zero_step_indicator (dra);
486 if (!indicator || integer_zerop (indicator))
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
490 "access also has a zero step\n");
491 return true;
493 else if (TREE_CODE (indicator) != INTEGER_CST)
494 vect_check_nonzero_value (loop_vinfo, indicator);
496 continue;
499 if (dist > 0 && DDR_REVERSED_P (ddr))
501 /* If DDR_REVERSED_P the order of the data-refs in DDR was
502 reversed (to make distance vector positive), and the actual
503 distance is negative. */
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
506 "dependence distance negative.\n");
507 /* Record a negative dependence distance to later limit the
508 amount of stmt copying / unrolling we can perform.
509 Only need to handle read-after-write dependence. */
510 if (DR_IS_READ (drb)
511 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
512 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
513 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
514 continue;
517 unsigned int abs_dist = abs (dist);
518 if (abs_dist >= 2 && abs_dist < *max_vf)
520 /* The dependence distance requires reduction of the maximal
521 vectorization factor. */
522 *max_vf = abs (dist);
523 if (dump_enabled_p ())
524 dump_printf_loc (MSG_NOTE, vect_location,
525 "adjusting maximal vectorization factor to %i\n",
526 *max_vf);
529 if (abs_dist >= *max_vf)
531 /* Dependence distance does not create dependence, as far as
532 vectorization is concerned, in this case. */
533 if (dump_enabled_p ())
534 dump_printf_loc (MSG_NOTE, vect_location,
535 "dependence distance >= VF.\n");
536 continue;
539 if (dump_enabled_p ())
541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
542 "not vectorized, possible dependence "
543 "between data-refs ");
 544 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 545 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 546 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 547 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
550 return true;
553 return false;
556 /* Function vect_analyze_data_ref_dependences.
558 Examine all the data references in the loop, and make sure there do not
559 exist any data dependences between them. Set *MAX_VF according to
560 the maximum vectorization factor the data dependences allow. */
562 bool
563 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
564 unsigned int *max_vf)
566 unsigned int i;
567 struct data_dependence_relation *ddr;
569 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
571 LOOP_VINFO_DDRS (loop_vinfo)
572 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
573 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
574 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
575 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
576 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
577 &LOOP_VINFO_DDRS (loop_vinfo),
578 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
579 return false;
 581 /* For epilogues we either have no aliases or alias versioning
 582 was applied to the original loop. Therefore we can simply take
 583 *MAX_VF from the original loop. */
584 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
585 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
586 else
587 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
588 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
589 return false;
591 return true;
595 /* Function vect_slp_analyze_data_ref_dependence.
597 Return TRUE if there (might) exist a dependence between a memory-reference
598 DRA and a memory-reference DRB. When versioning for alias may check a
 599 dependence at run-time, return FALSE. */
602 static bool
603 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
605 struct data_reference *dra = DDR_A (ddr);
606 struct data_reference *drb = DDR_B (ddr);
608 /* We need to check dependences of statements marked as unvectorizable
 609 as well; they can still prohibit vectorization. */
611 /* Independent data accesses. */
612 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
613 return false;
615 if (dra == drb)
616 return false;
618 /* Read-read is OK. */
619 if (DR_IS_READ (dra) && DR_IS_READ (drb))
620 return false;
622 /* If dra and drb are part of the same interleaving chain consider
623 them independent. */
624 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (vect_dr_stmt (dra)))
625 && (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (dra)))
626 == DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (drb)))))
627 return false;
629 /* Unknown data dependence. */
630 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
632 if (dump_enabled_p ())
634 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
635 "can't determine dependence between ");
636 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
637 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
638 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
639 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
642 else if (dump_enabled_p ())
644 dump_printf_loc (MSG_NOTE, vect_location,
645 "determined dependence between ");
646 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
647 dump_printf (MSG_NOTE, " and ");
648 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
649 dump_printf (MSG_NOTE, "\n");
652 return true;
656 /* Analyze dependences involved in the transform of SLP NODE. STORES
657 contain the vector of scalar stores of this instance if we are
658 disambiguating the loads. */
660 static bool
661 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
662 vec<gimple *> stores, gimple *last_store)
664 /* This walks over all stmts involved in the SLP load/store done
665 in NODE verifying we can sink them up to the last stmt in the
666 group. */
667 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
668 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
670 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
671 if (access == last_access)
672 continue;
673 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
674 ao_ref ref;
675 bool ref_initialized_p = false;
676 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
677 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
679 gimple *stmt = gsi_stmt (gsi);
680 if (! gimple_vuse (stmt)
681 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
682 continue;
684 /* If we couldn't record a (single) data reference for this
685 stmt we have to resort to the alias oracle. */
686 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
687 if (!dr_b)
689 /* We are moving a store or sinking a load - this means
690 we cannot use TBAA for disambiguation. */
691 if (!ref_initialized_p)
692 ao_ref_init (&ref, DR_REF (dr_a));
693 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
694 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
695 return false;
696 continue;
699 bool dependent = false;
700 /* If we run into a store of this same instance (we've just
701 marked those) then delay dependence checking until we run
702 into the last store because this is where it will have
703 been sunk to (and we verify if we can do that as well). */
704 if (gimple_visited_p (stmt))
706 if (stmt != last_store)
707 continue;
708 unsigned i;
709 gimple *store;
710 FOR_EACH_VEC_ELT (stores, i, store)
712 data_reference *store_dr
713 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
714 ddr_p ddr = initialize_data_dependence_relation
715 (dr_a, store_dr, vNULL);
716 dependent = vect_slp_analyze_data_ref_dependence (ddr);
717 free_dependence_relation (ddr);
718 if (dependent)
719 break;
722 else
724 ddr_p ddr = initialize_data_dependence_relation (dr_a,
725 dr_b, vNULL);
726 dependent = vect_slp_analyze_data_ref_dependence (ddr);
727 free_dependence_relation (ddr);
729 if (dependent)
730 return false;
733 return true;
 737 /* Function vect_slp_analyze_instance_dependence.
 739 Examine all the data references in the SLP instance, and make sure
 740 there do not exist any data dependences between them that would
 741 prevent the instance from being vectorized. */
743 bool
744 vect_slp_analyze_instance_dependence (slp_instance instance)
746 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
748 /* The stores of this instance are at the root of the SLP tree. */
749 slp_tree store = SLP_INSTANCE_TREE (instance);
750 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
751 store = NULL;
753 /* Verify we can sink stores to the vectorized stmt insert location. */
754 gimple *last_store = NULL;
755 if (store)
757 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
758 return false;
760 /* Mark stores in this instance and remember the last one. */
761 last_store = vect_find_last_scalar_stmt_in_slp (store);
762 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
763 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
766 bool res = true;
768 /* Verify we can sink loads to the vectorized stmt insert location,
769 special-casing stores of this instance. */
770 slp_tree load;
771 unsigned int i;
772 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
773 if (! vect_slp_analyze_node_dependences (instance, load,
774 store
775 ? SLP_TREE_SCALAR_STMTS (store)
776 : vNULL, last_store))
778 res = false;
779 break;
782 /* Unset the visited flag. */
783 if (store)
784 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
785 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
787 return res;
790 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
791 the statement that contains DRB, which is useful for recording in the
792 dump file. */
794 static void
795 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
796 innermost_loop_behavior *drb)
798 bool existed;
799 innermost_loop_behavior *&entry
800 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
801 if (!existed || entry->base_alignment < drb->base_alignment)
803 entry = drb;
804 if (dump_enabled_p ())
806 dump_printf_loc (MSG_NOTE, vect_location,
807 "recording new base alignment for ");
808 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
809 dump_printf (MSG_NOTE, "\n");
810 dump_printf_loc (MSG_NOTE, vect_location,
811 " alignment: %d\n", drb->base_alignment);
812 dump_printf_loc (MSG_NOTE, vect_location,
813 " misalignment: %d\n", drb->base_misalignment);
814 dump_printf_loc (MSG_NOTE, vect_location,
815 " based on: ");
816 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
821 /* If the region we're going to vectorize is reached, all unconditional
822 data references occur at least once. We can therefore pool the base
823 alignment guarantees from each unconditional reference. Do this by
824 going through all the data references in VINFO and checking whether
825 the containing statement makes the reference unconditionally. If so,
826 record the alignment of the base address in VINFO so that it can be
827 used for all other references with the same base. */
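/* For example, if one unconditionally executed statement dereferences
   'p' and guarantees that 'p' is 32-byte aligned, another reference
   with the same base address elsewhere in the region can reuse that
   guarantee even if nothing better is known about it locally.  */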
829 void
830 vect_record_base_alignments (vec_info *vinfo)
832 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
833 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
834 data_reference *dr;
835 unsigned int i;
836 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
838 gimple *stmt = vect_dr_stmt (dr);
839 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
840 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
841 && STMT_VINFO_VECTORIZABLE (stmt_info)
842 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
844 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
846 /* If DR is nested in the loop that is being vectorized, we can also
847 record the alignment of the base wrt the outer loop. */
848 if (loop && nested_in_vect_loop_p (loop, stmt))
849 vect_record_base_alignment
850 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
855 /* Return the target alignment for the vectorized form of DR. */
857 static unsigned int
858 vect_calculate_target_alignment (struct data_reference *dr)
860 gimple *stmt = vect_dr_stmt (dr);
861 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
862 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
863 return targetm.vectorize.preferred_vector_alignment (vectype);
866 /* Function vect_compute_data_ref_alignment
868 Compute the misalignment of the data reference DR.
870 Output:
871 1. DR_MISALIGNMENT (DR) is defined.
873 FOR NOW: No analysis is actually performed. Misalignment is calculated
874 only for trivial cases. TODO. */
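/* For example, with a target vector alignment of 16 bytes, a reference
   whose address is 4 bytes past a 16-byte aligned base is recorded as
   DR_MISALIGNMENT (DR) == 4.  */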
876 static void
877 vect_compute_data_ref_alignment (struct data_reference *dr)
879 gimple *stmt = vect_dr_stmt (dr);
880 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
881 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
882 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
883 struct loop *loop = NULL;
884 tree ref = DR_REF (dr);
885 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
887 if (dump_enabled_p ())
888 dump_printf_loc (MSG_NOTE, vect_location,
889 "vect_compute_data_ref_alignment:\n");
891 if (loop_vinfo)
892 loop = LOOP_VINFO_LOOP (loop_vinfo);
894 /* Initialize misalignment to unknown. */
895 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
897 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
898 return;
900 innermost_loop_behavior *drb = vect_dr_behavior (dr);
901 bool step_preserves_misalignment_p;
903 unsigned HOST_WIDE_INT vector_alignment
904 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
905 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
907 /* No step for BB vectorization. */
908 if (!loop)
910 gcc_assert (integer_zerop (drb->step));
911 step_preserves_misalignment_p = true;
914 /* In case the dataref is in an inner-loop of the loop that is being
915 vectorized (LOOP), we use the base and misalignment information
916 relative to the outer-loop (LOOP). This is ok only if the misalignment
917 stays the same throughout the execution of the inner-loop, which is why
 918 we have to check that the stride of the dataref in the inner-loop
 919 is a multiple of the vector alignment. */
920 else if (nested_in_vect_loop_p (loop, stmt))
922 step_preserves_misalignment_p
923 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
925 if (dump_enabled_p ())
927 if (step_preserves_misalignment_p)
928 dump_printf_loc (MSG_NOTE, vect_location,
929 "inner step divides the vector alignment.\n");
930 else
931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
932 "inner step doesn't divide the vector"
933 " alignment.\n");
937 /* Similarly we can only use base and misalignment information relative to
938 an innermost loop if the misalignment stays the same throughout the
939 execution of the loop. As above, this is the case if the stride of
 940 the dataref is a multiple of the alignment. */
941 else
943 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
944 step_preserves_misalignment_p
945 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
947 if (!step_preserves_misalignment_p && dump_enabled_p ())
948 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
949 "step doesn't divide the vector alignment.\n");
952 unsigned int base_alignment = drb->base_alignment;
953 unsigned int base_misalignment = drb->base_misalignment;
955 /* Calculate the maximum of the pooled base address alignment and the
956 alignment that we can compute for DR itself. */
957 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
958 if (entry && base_alignment < (*entry)->base_alignment)
960 base_alignment = (*entry)->base_alignment;
961 base_misalignment = (*entry)->base_misalignment;
964 if (drb->offset_alignment < vector_alignment
965 || !step_preserves_misalignment_p
966 /* We need to know whether the step wrt the vectorized loop is
967 negative when computing the starting misalignment below. */
968 || TREE_CODE (drb->step) != INTEGER_CST)
970 if (dump_enabled_p ())
972 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
973 "Unknown alignment for access: ");
974 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
975 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
977 return;
980 if (base_alignment < vector_alignment)
982 unsigned int max_alignment;
983 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
984 if (max_alignment < vector_alignment
985 || !vect_can_force_dr_alignment_p (base,
986 vector_alignment * BITS_PER_UNIT))
988 if (dump_enabled_p ())
990 dump_printf_loc (MSG_NOTE, vect_location,
991 "can't force alignment of ref: ");
992 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
993 dump_printf (MSG_NOTE, "\n");
995 return;
998 /* Force the alignment of the decl.
999 NOTE: This is the only change to the code we make during
1000 the analysis phase, before deciding to vectorize the loop. */
1001 if (dump_enabled_p ())
1003 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
1004 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1005 dump_printf (MSG_NOTE, "\n");
1008 DR_VECT_AUX (dr)->base_decl = base;
1009 DR_VECT_AUX (dr)->base_misaligned = true;
1010 base_misalignment = 0;
1012 poly_int64 misalignment
1013 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
 1015 /* If this is a backward running DR then the first access in the larger
 1016 vectype is actually N-1 elements before the address in the DR.
1017 Adjust misalign accordingly. */
1018 if (tree_int_cst_sgn (drb->step) < 0)
1019 /* PLUS because STEP is negative. */
1020 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1021 * TREE_INT_CST_LOW (drb->step));
1023 unsigned int const_misalignment;
1024 if (!known_misalignment (misalignment, vector_alignment,
1025 &const_misalignment))
1027 if (dump_enabled_p ())
1029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1030 "Non-constant misalignment for access: ");
1031 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1032 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1034 return;
1037 SET_DR_MISALIGNMENT (dr, const_misalignment);
1039 if (dump_enabled_p ())
1041 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1042 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1043 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1044 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1047 return;
1050 /* Function vect_update_misalignment_for_peel.
1051 Sets DR's misalignment
1052 - to 0 if it has the same alignment as DR_PEEL,
 1053 - to the misalignment computed using NPEEL if DR's misalignment is known,
1054 - to -1 (unknown) otherwise.
1056 DR - the data reference whose misalignment is to be adjusted.
1057 DR_PEEL - the data reference whose misalignment is being made
1058 zero in the vector loop by the peel.
1059 NPEEL - the number of iterations in the peel loop if the misalignment
1060 of DR_PEEL is known at compile time. */
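/* For example, peeling NPEEL == 2 iterations of a DR with a 4-byte
   element and a positive step adds 2 * 4 == 8 bytes to its known
   misalignment, taken modulo the target alignment; for a negative
   step the same amount is subtracted.  */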
1062 static void
1063 vect_update_misalignment_for_peel (struct data_reference *dr,
1064 struct data_reference *dr_peel, int npeel)
1066 unsigned int i;
1067 vec<dr_p> same_aligned_drs;
1068 struct data_reference *current_dr;
1069 int dr_size = vect_get_scalar_dr_size (dr);
1070 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1071 stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
1072 stmt_vec_info peel_stmt_info = vinfo_for_stmt (vect_dr_stmt (dr_peel));
1074 /* For interleaved data accesses the step in the loop must be multiplied by
1075 the size of the interleaving group. */
1076 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1077 dr_size *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
1078 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1079 dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1081 /* It can be assumed that the data refs with the same alignment as dr_peel
1082 are aligned in the vector loop. */
1083 same_aligned_drs
1084 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (vect_dr_stmt (dr_peel)));
1085 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1087 if (current_dr != dr)
1088 continue;
1089 gcc_assert (!known_alignment_for_access_p (dr)
1090 || !known_alignment_for_access_p (dr_peel)
1091 || (DR_MISALIGNMENT (dr) / dr_size
1092 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1093 SET_DR_MISALIGNMENT (dr, 0);
1094 return;
1097 if (known_alignment_for_access_p (dr)
1098 && known_alignment_for_access_p (dr_peel))
1100 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1101 int misal = DR_MISALIGNMENT (dr);
1102 misal += negative ? -npeel * dr_size : npeel * dr_size;
1103 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1104 SET_DR_MISALIGNMENT (dr, misal);
1105 return;
1108 if (dump_enabled_p ())
 1109 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment "
1110 "to unknown (-1).\n");
1111 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1115 /* Function verify_data_ref_alignment
1117 Return TRUE if DR can be handled with respect to alignment. */
1119 static bool
1120 verify_data_ref_alignment (data_reference_p dr)
1122 enum dr_alignment_support supportable_dr_alignment
1123 = vect_supportable_dr_alignment (dr, false);
1124 if (!supportable_dr_alignment)
1126 if (dump_enabled_p ())
1128 if (DR_IS_READ (dr))
1129 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1130 "not vectorized: unsupported unaligned load.");
1131 else
1132 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1133 "not vectorized: unsupported unaligned "
1134 "store.");
1136 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1137 DR_REF (dr));
1138 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1140 return false;
1143 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1144 dump_printf_loc (MSG_NOTE, vect_location,
1145 "Vectorizing an unaligned access.\n");
1147 return true;
1150 /* Function vect_verify_datarefs_alignment
1152 Return TRUE if all data references in the loop can be
1153 handled with respect to alignment. */
1155 bool
1156 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1158 vec<data_reference_p> datarefs = vinfo->datarefs;
1159 struct data_reference *dr;
1160 unsigned int i;
1162 FOR_EACH_VEC_ELT (datarefs, i, dr)
1164 gimple *stmt = vect_dr_stmt (dr);
1165 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1167 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1168 continue;
1170 /* For interleaving, only the alignment of the first access matters. */
1171 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1172 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1173 continue;
1175 /* Strided accesses perform only component accesses, alignment is
1176 irrelevant for them. */
1177 if (STMT_VINFO_STRIDED_P (stmt_info)
1178 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1179 continue;
1181 if (! verify_data_ref_alignment (dr))
1182 return false;
1185 return true;
 1188 /* Given a memory reference EXP, return whether its alignment is less
1189 than its size. */
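/* E.g. an 8-byte 'double' field that is only guaranteed 4-byte
   alignment because it sits in a packed structure is not size
   aligned.  */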
1191 static bool
1192 not_size_aligned (tree exp)
1194 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1195 return true;
1197 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1198 > get_object_alignment (exp));
1201 /* Function vector_alignment_reachable_p
1203 Return true if vector alignment for DR is reachable by peeling
1204 a few loop iterations. Return false otherwise. */
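/* For example, a data reference with 4-byte elements that is
   misaligned by 4 bytes against a 16-byte target alignment becomes
   aligned after peeling (16 - 4) / 4 == 3 scalar iterations.  */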
1206 static bool
1207 vector_alignment_reachable_p (struct data_reference *dr)
1209 gimple *stmt = vect_dr_stmt (dr);
1210 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1211 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1213 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 1215 /* For interleaved access we peel only if the number of iterations in
 1216 the prolog loop ({VF - misalignment}) is a multiple of the
 1217 number of interleaved accesses. */
1218 int elem_size, mis_in_elements;
1220 /* FORNOW: handle only known alignment. */
1221 if (!known_alignment_for_access_p (dr))
1222 return false;
1224 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1225 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1226 elem_size = vector_element_size (vector_size, nelements);
1227 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1229 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1230 return false;
1233 /* If misalignment is known at the compile time then allow peeling
1234 only if natural alignment is reachable through peeling. */
1235 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1237 HOST_WIDE_INT elmsize =
1238 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1239 if (dump_enabled_p ())
1241 dump_printf_loc (MSG_NOTE, vect_location,
1242 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1243 dump_printf (MSG_NOTE,
1244 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1246 if (DR_MISALIGNMENT (dr) % elmsize)
1248 if (dump_enabled_p ())
1249 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1250 "data size does not divide the misalignment.\n");
1251 return false;
1255 if (!known_alignment_for_access_p (dr))
1257 tree type = TREE_TYPE (DR_REF (dr));
1258 bool is_packed = not_size_aligned (DR_REF (dr));
1259 if (dump_enabled_p ())
1260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1261 "Unknown misalignment, %snaturally aligned\n",
1262 is_packed ? "not " : "");
1263 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1266 return true;
1270 /* Calculate the cost of the memory access represented by DR. */
1272 static void
1273 vect_get_data_access_cost (struct data_reference *dr,
1274 unsigned int *inside_cost,
1275 unsigned int *outside_cost,
1276 stmt_vector_for_cost *body_cost_vec,
1277 stmt_vector_for_cost *prologue_cost_vec)
1279 gimple *stmt = vect_dr_stmt (dr);
1280 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1281 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1282 int ncopies;
1284 if (PURE_SLP_STMT (stmt_info))
1285 ncopies = 1;
1286 else
1287 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1289 if (DR_IS_READ (dr))
1290 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1291 prologue_cost_vec, body_cost_vec, false);
1292 else
1293 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1295 if (dump_enabled_p ())
1296 dump_printf_loc (MSG_NOTE, vect_location,
1297 "vect_get_data_access_cost: inside_cost = %d, "
1298 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1302 typedef struct _vect_peel_info
1304 struct data_reference *dr;
1305 int npeel;
1306 unsigned int count;
1307 } *vect_peel_info;
1309 typedef struct _vect_peel_extended_info
1311 struct _vect_peel_info peel_info;
1312 unsigned int inside_cost;
1313 unsigned int outside_cost;
1314 } *vect_peel_extended_info;
1317 /* Peeling hashtable helpers. */
1319 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1321 static inline hashval_t hash (const _vect_peel_info *);
1322 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1325 inline hashval_t
1326 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1328 return (hashval_t) peel_info->npeel;
1331 inline bool
1332 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1334 return (a->npeel == b->npeel);
1338 /* Insert DR into peeling hash table with NPEEL as key. */
1340 static void
1341 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1342 loop_vec_info loop_vinfo, struct data_reference *dr,
1343 int npeel)
1345 struct _vect_peel_info elem, *slot;
1346 _vect_peel_info **new_slot;
1347 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1349 elem.npeel = npeel;
1350 slot = peeling_htab->find (&elem);
1351 if (slot)
1352 slot->count++;
1353 else
1355 slot = XNEW (struct _vect_peel_info);
1356 slot->npeel = npeel;
1357 slot->dr = dr;
1358 slot->count = 1;
1359 new_slot = peeling_htab->find_slot (slot, INSERT);
1360 *new_slot = slot;
1363 if (!supportable_dr_alignment
1364 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1365 slot->count += VECT_MAX_COST;
1369 /* Traverse peeling hash table to find peeling option that aligns maximum
1370 number of data accesses. */
1373 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1374 _vect_peel_extended_info *max)
1376 vect_peel_info elem = *slot;
1378 if (elem->count > max->peel_info.count
1379 || (elem->count == max->peel_info.count
1380 && max->peel_info.npeel > elem->npeel))
1382 max->peel_info.npeel = elem->npeel;
1383 max->peel_info.count = elem->count;
1384 max->peel_info.dr = elem->dr;
1387 return 1;
1390 /* Get the costs of peeling NPEEL iterations checking data access costs
1391 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1392 misalignment will be zero after peeling. */
1394 static void
1395 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1396 struct data_reference *dr0,
1397 unsigned int *inside_cost,
1398 unsigned int *outside_cost,
1399 stmt_vector_for_cost *body_cost_vec,
1400 stmt_vector_for_cost *prologue_cost_vec,
1401 unsigned int npeel,
1402 bool unknown_misalignment)
1404 unsigned i;
1405 data_reference *dr;
1407 FOR_EACH_VEC_ELT (datarefs, i, dr)
1409 gimple *stmt = vect_dr_stmt (dr);
1410 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1411 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1412 continue;
1414 /* For interleaving, only the alignment of the first access
1415 matters. */
1416 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1417 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1418 continue;
1420 /* Strided accesses perform only component accesses, alignment is
1421 irrelevant for them. */
1422 if (STMT_VINFO_STRIDED_P (stmt_info)
1423 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1424 continue;
1426 int save_misalignment;
1427 save_misalignment = DR_MISALIGNMENT (dr);
1428 if (npeel == 0)
1430 else if (unknown_misalignment && dr == dr0)
1431 SET_DR_MISALIGNMENT (dr, 0);
1432 else
1433 vect_update_misalignment_for_peel (dr, dr0, npeel);
1434 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1435 body_cost_vec, prologue_cost_vec);
1436 SET_DR_MISALIGNMENT (dr, save_misalignment);
1440 /* Traverse peeling hash table and calculate cost for each peeling option.
1441 Find the one with the lowest cost. */
1444 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1445 _vect_peel_extended_info *min)
1447 vect_peel_info elem = *slot;
1448 int dummy;
1449 unsigned int inside_cost = 0, outside_cost = 0;
1450 gimple *stmt = vect_dr_stmt (elem->dr);
1451 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1452 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1453 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1454 epilogue_cost_vec;
1456 prologue_cost_vec.create (2);
1457 body_cost_vec.create (2);
1458 epilogue_cost_vec.create (2);
1460 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1461 elem->dr, &inside_cost, &outside_cost,
1462 &body_cost_vec, &prologue_cost_vec,
1463 elem->npeel, false);
1465 body_cost_vec.release ();
1467 outside_cost += vect_get_known_peeling_cost
1468 (loop_vinfo, elem->npeel, &dummy,
1469 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1470 &prologue_cost_vec, &epilogue_cost_vec);
1472 /* Prologue and epilogue costs are added to the target model later.
1473 These costs depend only on the scalar iteration cost, the
1474 number of peeling iterations finally chosen, and the number of
1475 misaligned statements. So discard the information found here. */
1476 prologue_cost_vec.release ();
1477 epilogue_cost_vec.release ();
1479 if (inside_cost < min->inside_cost
1480 || (inside_cost == min->inside_cost
1481 && outside_cost < min->outside_cost))
1483 min->inside_cost = inside_cost;
1484 min->outside_cost = outside_cost;
1485 min->peel_info.dr = elem->dr;
1486 min->peel_info.npeel = elem->npeel;
1487 min->peel_info.count = elem->count;
1490 return 1;
1494 /* Choose best peeling option by traversing peeling hash table and either
1495 choosing an option with the lowest cost (if cost model is enabled) or the
1496 option that aligns as many accesses as possible. */
1498 static struct _vect_peel_extended_info
1499 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1500 loop_vec_info loop_vinfo)
1502 struct _vect_peel_extended_info res;
1504 res.peel_info.dr = NULL;
1506 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1508 res.inside_cost = INT_MAX;
1509 res.outside_cost = INT_MAX;
1510 peeling_htab->traverse <_vect_peel_extended_info *,
1511 vect_peeling_hash_get_lowest_cost> (&res);
1513 else
1515 res.peel_info.count = 0;
1516 peeling_htab->traverse <_vect_peel_extended_info *,
1517 vect_peeling_hash_get_most_frequent> (&res);
1518 res.inside_cost = 0;
1519 res.outside_cost = 0;
1522 return res;
1525 /* Return true if the new peeling NPEEL is supported. */
1527 static bool
1528 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1529 unsigned npeel)
1531 unsigned i;
1532 struct data_reference *dr = NULL;
1533 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1534 gimple *stmt;
1535 stmt_vec_info stmt_info;
1536 enum dr_alignment_support supportable_dr_alignment;
1538 /* Ensure that all data refs can be vectorized after the peel. */
1539 FOR_EACH_VEC_ELT (datarefs, i, dr)
1541 int save_misalignment;
1543 if (dr == dr0)
1544 continue;
1546 stmt = vect_dr_stmt (dr);
1547 stmt_info = vinfo_for_stmt (stmt);
1548 /* For interleaving, only the alignment of the first access
1549 matters. */
1550 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1551 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1552 continue;
1554 /* Strided accesses perform only component accesses, alignment is
1555 irrelevant for them. */
1556 if (STMT_VINFO_STRIDED_P (stmt_info)
1557 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1558 continue;
1560 save_misalignment = DR_MISALIGNMENT (dr);
1561 vect_update_misalignment_for_peel (dr, dr0, npeel);
1562 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1563 SET_DR_MISALIGNMENT (dr, save_misalignment);
1565 if (!supportable_dr_alignment)
1566 return false;
1569 return true;
1572 /* Function vect_enhance_data_refs_alignment
1574 This pass will use loop versioning and loop peeling in order to enhance
1575 the alignment of data references in the loop.
1577 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1578 original loop is to be vectorized. Any other loops that are created by
 1579 the transformations performed in this pass are not supposed to be
1580 vectorized. This restriction will be relaxed.
1582 This pass will require a cost model to guide it whether to apply peeling
1583 or versioning or a combination of the two. For example, the scheme that
1584 intel uses when given a loop with several memory accesses, is as follows:
 1585 choose one memory access ('p') whose alignment you want to force by doing
1586 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1587 other accesses are not necessarily aligned, or (2) use loop versioning to
1588 generate one loop in which all accesses are aligned, and another loop in
1589 which only 'p' is necessarily aligned.
1591 ("Automatic Intra-Register Vectorization for the Intel Architecture",
 1592 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1593 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1595 Devising a cost model is the most critical aspect of this work. It will
1596 guide us on which access to peel for, whether to use loop versioning, how
1597 many versions to create, etc. The cost model will probably consist of
1598 generic considerations as well as target specific considerations (on
1599 powerpc for example, misaligned stores are more painful than misaligned
1600 loads).
1602 Here are the general steps involved in alignment enhancements:
1604 -- original loop, before alignment analysis:
1605 for (i=0; i<N; i++){
1606 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1607 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1610 -- After vect_compute_data_refs_alignment:
1611 for (i=0; i<N; i++){
1612 x = q[i]; # DR_MISALIGNMENT(q) = 3
1613 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1616 -- Possibility 1: we do loop versioning:
1617 if (p is aligned) {
1618 for (i=0; i<N; i++){ # loop 1A
1619 x = q[i]; # DR_MISALIGNMENT(q) = 3
1620 p[i] = y; # DR_MISALIGNMENT(p) = 0
1623 else {
1624 for (i=0; i<N; i++){ # loop 1B
1625 x = q[i]; # DR_MISALIGNMENT(q) = 3
1626 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1630 -- Possibility 2: we do loop peeling:
1631 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1632 x = q[i];
1633 p[i] = y;
1635 for (i = 3; i < N; i++){ # loop 2A
1636 x = q[i]; # DR_MISALIGNMENT(q) = 0
1637 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1640 -- Possibility 3: combination of loop peeling and versioning:
1641 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1642 x = q[i];
1643 p[i] = y;
1645 if (p is aligned) {
1646 for (i = 3; i<N; i++){ # loop 3A
1647 x = q[i]; # DR_MISALIGNMENT(q) = 0
1648 p[i] = y; # DR_MISALIGNMENT(p) = 0
1651 else {
1652 for (i = 3; i<N; i++){ # loop 3B
1653 x = q[i]; # DR_MISALIGNMENT(q) = 0
1654 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1658 These loops are later passed to loop_transform to be vectorized. The
1659 vectorizer will use the alignment information to guide the transformation
1660 (whether to generate regular loads/stores, or with special handling for
1661 misalignment). */
1663 bool
1664 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1666 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1667 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1668 enum dr_alignment_support supportable_dr_alignment;
1669 struct data_reference *dr0 = NULL, *first_store = NULL;
1670 struct data_reference *dr;
1671 unsigned int i, j;
1672 bool do_peeling = false;
1673 bool do_versioning = false;
1674 bool stat;
1675 gimple *stmt;
1676 stmt_vec_info stmt_info;
1677 unsigned int npeel = 0;
1678 bool one_misalignment_known = false;
1679 bool one_misalignment_unknown = false;
1680 bool one_dr_unsupportable = false;
1681 struct data_reference *unsupportable_dr = NULL;
1682 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1683 unsigned possible_npeel_number = 1;
1684 tree vectype;
1685 unsigned int mis, same_align_drs_max = 0;
1686 hash_table<peel_info_hasher> peeling_htab (1);
1688 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1690 /* Reset data so we can safely be called multiple times. */
1691 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1692 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1694 /* While cost model enhancements are expected in the future, the high level
1695 view of the code at this time is as follows:
1697 A) If there is a misaligned access then see if peeling to align
1698 this access can make all data references satisfy
1699 vect_supportable_dr_alignment. If so, update data structures
1700 as needed and return true.
1702 B) If peeling wasn't possible and there is a data reference with an
1703 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1704 then see if loop versioning checks can be used to make all data
1705 references satisfy vect_supportable_dr_alignment. If so, update
1706 data structures as needed and return true.
1708 C) If neither peeling nor versioning were successful then return false if
1709 any data reference does not satisfy vect_supportable_dr_alignment.
1711 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1713 Note, Possibility 3 above (which is peeling and versioning together) is not
1714 being done at this time. */
1716 /* (1) Peeling to force alignment. */
1718 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1719 Considerations:
1720 + How many accesses will become aligned due to the peeling
1721 - How many accesses will become unaligned due to the peeling,
1722 and the cost of misaligned accesses.
1723 - The cost of peeling (the extra runtime checks, the increase
1724 in code size). */
1726 FOR_EACH_VEC_ELT (datarefs, i, dr)
1728 stmt = vect_dr_stmt (dr);
1729 stmt_info = vinfo_for_stmt (stmt);
1731 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1732 continue;
1734 /* For interleaving, only the alignment of the first access
1735 matters. */
1736 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1737 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1738 continue;
1740 /* For scatter-gather or invariant accesses there is nothing
1741 to enhance. */
1742 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1743 || integer_zerop (DR_STEP (dr)))
1744 continue;
1746 /* Strided accesses perform only component accesses, alignment is
1747 irrelevant for them. */
1748 if (STMT_VINFO_STRIDED_P (stmt_info)
1749 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1750 continue;
1752 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1753 do_peeling = vector_alignment_reachable_p (dr);
1754 if (do_peeling)
1756 if (known_alignment_for_access_p (dr))
1758 unsigned int npeel_tmp = 0;
1759 bool negative = tree_int_cst_compare (DR_STEP (dr),
1760 size_zero_node) < 0;
1762 vectype = STMT_VINFO_VECTYPE (stmt_info);
1763 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1764 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1765 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1766 if (DR_MISALIGNMENT (dr) != 0)
1767 npeel_tmp = (mis & (target_align - 1)) / dr_size;
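/* For instance, with a positive step, a target alignment of 16 bytes,
   4-byte elements and DR_MISALIGNMENT of 8, MIS is -8 and NPEEL_TMP is
   ((-8) & 15) / 4 = 2: peeling two scalar iterations advances the access
   by 8 bytes and makes it 16-byte aligned.  */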
1769 /* For multiple types, it is possible that the bigger type access
1770 will have more than one peeling option. E.g., a loop with two
1771 types: one of size (vector size / 4), and the other one of
1772 size (vector size / 8). The vectorization factor will be 8. If both
1773 accesses are misaligned by 3, the first one needs one scalar
1774 iteration to be aligned, and the second one needs 5. But the
1775 first one will be aligned also by peeling 5 scalar
1776 iterations, and in that case both accesses will be aligned.
1777 Hence, in addition to the immediate peeling amount, we also want
1778 to try adding a full vector size, as long as we don't exceed the
1779 vectorization factor.
1780 We do this automatically for the cost model, since we calculate the
1781 cost for every peeling option. */
1782 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1784 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1785 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1786 possible_npeel_number
1787 = vect_get_num_vectors (nscalars, vectype);
1789 /* NPEEL_TMP is 0 when there is no misalignment, but also
1790 allow peeling NELEMENTS. */
1791 if (DR_MISALIGNMENT (dr) == 0)
1792 possible_npeel_number++;
1795 /* Save info about DR in the hash table. Also include peeling
1796 amounts according to the explanation above. */
1797 for (j = 0; j < possible_npeel_number; j++)
1799 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1800 dr, npeel_tmp);
1801 npeel_tmp += target_align / dr_size;
1804 one_misalignment_known = true;
1806 else
1808 /* If we don't know any misalignment values, we prefer
1809 peeling for the data-ref that has the maximum number of data-refs
1810 with the same alignment, unless the target prefers to align
1811 stores over loads. */
1812 unsigned same_align_drs
1813 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1814 if (!dr0
1815 || same_align_drs_max < same_align_drs)
1817 same_align_drs_max = same_align_drs;
1818 dr0 = dr;
1820 /* For data-refs with the same number of related
1821 accesses prefer the one where the misalignment
1822 computation will be invariant in the outermost loop. */
1823 else if (same_align_drs_max == same_align_drs)
1825 struct loop *ivloop0, *ivloop;
1826 ivloop0 = outermost_invariant_loop_for_expr
1827 (loop, DR_BASE_ADDRESS (dr0));
1828 ivloop = outermost_invariant_loop_for_expr
1829 (loop, DR_BASE_ADDRESS (dr));
1830 if ((ivloop && !ivloop0)
1831 || (ivloop && ivloop0
1832 && flow_loop_nested_p (ivloop, ivloop0)))
1833 dr0 = dr;
1836 one_misalignment_unknown = true;
1838 /* Check for data refs with unsupportable alignment that
1839 can be peeled. */
1840 if (!supportable_dr_alignment)
1842 one_dr_unsupportable = true;
1843 unsupportable_dr = dr;
1846 if (!first_store && DR_IS_WRITE (dr))
1847 first_store = dr;
1850 else
1852 if (!aligned_access_p (dr))
1854 if (dump_enabled_p ())
1855 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1856 "vector alignment may not be reachable\n");
1857 break;
1862 /* Check if we can possibly peel the loop. */
1863 if (!vect_can_advance_ivs_p (loop_vinfo)
1864 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1865 || loop->inner)
1866 do_peeling = false;
1868 struct _vect_peel_extended_info peel_for_known_alignment;
1869 struct _vect_peel_extended_info peel_for_unknown_alignment;
1870 struct _vect_peel_extended_info best_peel;
1872 peel_for_unknown_alignment.inside_cost = INT_MAX;
1873 peel_for_unknown_alignment.outside_cost = INT_MAX;
1874 peel_for_unknown_alignment.peel_info.count = 0;
1876 if (do_peeling
1877 && one_misalignment_unknown)
1879 /* Check whether the target prefers stores over loads, i.e., whether
1880 misaligned stores are more expensive than misaligned loads (taking
1881 drs with the same alignment into account). */
1882 unsigned int load_inside_cost = 0;
1883 unsigned int load_outside_cost = 0;
1884 unsigned int store_inside_cost = 0;
1885 unsigned int store_outside_cost = 0;
1886 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1888 stmt_vector_for_cost dummy;
1889 dummy.create (2);
1890 vect_get_peeling_costs_all_drs (datarefs, dr0,
1891 &load_inside_cost,
1892 &load_outside_cost,
1893 &dummy, &dummy, estimated_npeels, true);
1894 dummy.release ();
1896 if (first_store)
1898 dummy.create (2);
1899 vect_get_peeling_costs_all_drs (datarefs, first_store,
1900 &store_inside_cost,
1901 &store_outside_cost,
1902 &dummy, &dummy,
1903 estimated_npeels, true);
1904 dummy.release ();
1906 else
1908 store_inside_cost = INT_MAX;
1909 store_outside_cost = INT_MAX;
1912 if (load_inside_cost > store_inside_cost
1913 || (load_inside_cost == store_inside_cost
1914 && load_outside_cost > store_outside_cost))
1916 dr0 = first_store;
1917 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1918 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1920 else
1922 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1923 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1926 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1927 prologue_cost_vec.create (2);
1928 epilogue_cost_vec.create (2);
1930 int dummy2;
1931 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1932 (loop_vinfo, estimated_npeels, &dummy2,
1933 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1934 &prologue_cost_vec, &epilogue_cost_vec);
1936 prologue_cost_vec.release ();
1937 epilogue_cost_vec.release ();
1939 peel_for_unknown_alignment.peel_info.count = 1
1940 + STMT_VINFO_SAME_ALIGN_REFS
1941 (vinfo_for_stmt (vect_dr_stmt (dr0))).length ();
1944 peel_for_unknown_alignment.peel_info.npeel = 0;
1945 peel_for_unknown_alignment.peel_info.dr = dr0;
1947 best_peel = peel_for_unknown_alignment;
1949 peel_for_known_alignment.inside_cost = INT_MAX;
1950 peel_for_known_alignment.outside_cost = INT_MAX;
1951 peel_for_known_alignment.peel_info.count = 0;
1952 peel_for_known_alignment.peel_info.dr = NULL;
1954 if (do_peeling && one_misalignment_known)
1956 /* Peeling is possible, and no data access requires alignment in order
1957 to be supported. So we try to choose the best possible peeling from
1958 the hash table. */
1959 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1960 (&peeling_htab, loop_vinfo);
1963 /* Compare costs of peeling for known and unknown alignment. */
1964 if (peel_for_known_alignment.peel_info.dr != NULL
1965 && peel_for_unknown_alignment.inside_cost
1966 >= peel_for_known_alignment.inside_cost)
1968 best_peel = peel_for_known_alignment;
1970 /* If the best peeling for known alignment has NPEEL == 0, perform no
1971 peeling at all except if there is an unsupportable dr that we can
1972 align. */
1973 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1974 do_peeling = false;
1977 /* If there is an unsupportable data ref, prefer this over all choices so far
1978 since we'd have to discard a chosen peeling anyway, unless it happened
1979 to align the unsupportable data ref. */
1980 if (one_dr_unsupportable)
1981 dr0 = unsupportable_dr;
1982 else if (do_peeling)
1984 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1985 TODO: Use nopeel_outside_cost or get rid of it? */
1986 unsigned nopeel_inside_cost = 0;
1987 unsigned nopeel_outside_cost = 0;
1989 stmt_vector_for_cost dummy;
1990 dummy.create (2);
1991 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1992 &nopeel_outside_cost, &dummy, &dummy,
1993 0, false);
1994 dummy.release ();
1996 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1997 costs will be recorded. */
1998 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1999 prologue_cost_vec.create (2);
2000 epilogue_cost_vec.create (2);
2002 int dummy2;
2003 nopeel_outside_cost += vect_get_known_peeling_cost
2004 (loop_vinfo, 0, &dummy2,
2005 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2006 &prologue_cost_vec, &epilogue_cost_vec);
2008 prologue_cost_vec.release ();
2009 epilogue_cost_vec.release ();
2011 npeel = best_peel.peel_info.npeel;
2012 dr0 = best_peel.peel_info.dr;
2014 /* If not peeling at all is no more expensive than the best peeling we
2015 have found so far, don't perform any peeling. */
2016 if (nopeel_inside_cost <= best_peel.inside_cost)
2017 do_peeling = false;
2020 if (do_peeling)
2022 stmt = vect_dr_stmt (dr0);
2023 stmt_info = vinfo_for_stmt (stmt);
2024 vectype = STMT_VINFO_VECTYPE (stmt_info);
2026 if (known_alignment_for_access_p (dr0))
2028 bool negative = tree_int_cst_compare (DR_STEP (dr0),
2029 size_zero_node) < 0;
2030 if (!npeel)
2032 /* Since it's known at compile time, compute the number of
2033 iterations in the peeled loop (the peeling factor) for use in
2034 updating DR_MISALIGNMENT values. The peeling factor is the
2035 vectorization factor minus the misalignment as an element
2036 count. */
2037 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2038 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2039 npeel = ((mis & (target_align - 1))
2040 / vect_get_scalar_dr_size (dr0));
2043 /* For interleaved data access every iteration accesses all the
2044 members of the group, therefore we divide the number of iterations
2045 by the group size. */
2046 stmt_info = vinfo_for_stmt (vect_dr_stmt (dr0));
2047 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2048 npeel /= DR_GROUP_SIZE (stmt_info);
2050 if (dump_enabled_p ())
2051 dump_printf_loc (MSG_NOTE, vect_location,
2052 "Try peeling by %d\n", npeel);
2055 /* Ensure that all datarefs can be vectorized after the peel. */
2056 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2057 do_peeling = false;
2059 /* Check if all datarefs are supportable and log. */
2060 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2062 stat = vect_verify_datarefs_alignment (loop_vinfo);
2063 if (!stat)
2064 do_peeling = false;
2065 else
2066 return stat;
2069 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2070 if (do_peeling)
2072 unsigned max_allowed_peel
2073 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2074 if (max_allowed_peel != (unsigned)-1)
2076 unsigned max_peel = npeel;
2077 if (max_peel == 0)
2079 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2080 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2082 if (max_peel > max_allowed_peel)
2084 do_peeling = false;
2085 if (dump_enabled_p ())
2086 dump_printf_loc (MSG_NOTE, vect_location,
2087 "Disable peeling, max peels reached: %d\n", max_peel);
2092 /* Cost model #2 - if peeling may result in a remaining loop not
2093 iterating enough to be vectorized then do not peel. Since this
2094 is a cost heuristic rather than a correctness decision, use the
2095 most likely runtime value for variable vectorization factors. */
2096 if (do_peeling
2097 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2099 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2100 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2101 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2102 < assumed_vf + max_peel)
2103 do_peeling = false;
2106 if (do_peeling)
2108 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2109 If the misalignment of DR_i is identical to that of dr0 then set
2110 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2111 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2112 by the peeling factor times the element size of DR_i (MOD the
2113 vectorization factor times the size). Otherwise, the
2114 misalignment of DR_i must be set to unknown. */
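/* For instance, with a 16-byte target alignment, 4-byte elements and
   NPEEL of 2, a data reference that shared dr0's misalignment of 8
   becomes aligned (misalignment 0), while one that was misaligned by 4
   is advanced by 2 * 4 = 8 bytes and ends up with misalignment 12.  */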
2115 FOR_EACH_VEC_ELT (datarefs, i, dr)
2116 if (dr != dr0)
2118 /* Strided accesses perform only component accesses, alignment
2119 is irrelevant for them. */
2120 stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
2121 if (STMT_VINFO_STRIDED_P (stmt_info)
2122 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2123 continue;
2125 vect_update_misalignment_for_peel (dr, dr0, npeel);
2128 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2129 if (npeel)
2130 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2131 else
2132 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2133 = DR_MISALIGNMENT (dr0);
2134 SET_DR_MISALIGNMENT (dr0, 0);
2135 if (dump_enabled_p ())
2137 dump_printf_loc (MSG_NOTE, vect_location,
2138 "Alignment of access forced using peeling.\n");
2139 dump_printf_loc (MSG_NOTE, vect_location,
2140 "Peeling for alignment will be applied.\n");
2143 /* The inside-loop cost will be accounted for in vectorizable_load
2144 and vectorizable_store correctly with adjusted alignments.
2145 Drop the body_cst_vec on the floor here. */
2146 stat = vect_verify_datarefs_alignment (loop_vinfo);
2147 gcc_assert (stat);
2148 return stat;
2152 /* (2) Versioning to force alignment. */
2154 /* Try versioning if:
2155 1) optimize loop for speed
2156 2) there is at least one unsupported misaligned data ref with an unknown
2157 misalignment, and
2158 3) all misaligned data refs with a known misalignment are supported, and
2159 4) the number of runtime alignment checks is within reason. */
2161 do_versioning =
2162 optimize_loop_nest_for_speed_p (loop)
2163 && (!loop->inner); /* FORNOW */
2165 if (do_versioning)
2167 FOR_EACH_VEC_ELT (datarefs, i, dr)
2169 stmt = vect_dr_stmt (dr);
2170 stmt_info = vinfo_for_stmt (stmt);
2172 /* For interleaving, only the alignment of the first access
2173 matters. */
2174 if (aligned_access_p (dr)
2175 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2176 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2177 continue;
2179 if (STMT_VINFO_STRIDED_P (stmt_info))
2181 /* Strided loads perform only component accesses, alignment is
2182 irrelevant for them. */
2183 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2184 continue;
2185 do_versioning = false;
2186 break;
2189 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2191 if (!supportable_dr_alignment)
2193 gimple *stmt;
2194 int mask;
2195 tree vectype;
2197 if (known_alignment_for_access_p (dr)
2198 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2199 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2201 do_versioning = false;
2202 break;
2205 stmt = vect_dr_stmt (dr);
2206 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2207 gcc_assert (vectype);
2209 /* At present we don't support versioning for alignment
2210 with variable VF, since there's no guarantee that the
2211 VF is a power of two. We could relax this if we added
2212 a way of enforcing a power-of-two size. */
2213 unsigned HOST_WIDE_INT size;
2214 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2216 do_versioning = false;
2217 break;
2220 /* The rightmost bits of an aligned address must be zeros.
2221 Construct the mask needed for this test. For example,
2222 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2223 mask must be 15 = 0xf. */
2224 mask = size - 1;
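/* In outline, the run-time test that the versioning code emits later
   checks something like

     if ((((uintptr_t) addr_1 | ... | (uintptr_t) addr_n) & mask) == 0)
       ... run the vectorized loop ...
     else
       ... run the scalar loop ...

   i.e. all potentially misaligned addresses are OR-ed together and
   tested against MASK in a single comparison.  */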
2226 /* FORNOW: use the same mask to test all potentially unaligned
2227 references in the loop. The vectorizer currently supports
2228 a single vector size, see the reference to
2229 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2230 vectorization factor is computed. */
2231 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2232 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2233 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2234 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2235 vect_dr_stmt (dr));
2239 /* Versioning requires at least one misaligned data reference. */
2240 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2241 do_versioning = false;
2242 else if (!do_versioning)
2243 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2246 if (do_versioning)
2248 vec<gimple *> may_misalign_stmts
2249 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2250 gimple *stmt;
2252 /* It can now be assumed that the data references in the statements
2253 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2254 of the loop being vectorized. */
2255 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2257 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2258 dr = STMT_VINFO_DATA_REF (stmt_info);
2259 SET_DR_MISALIGNMENT (dr, 0);
2260 if (dump_enabled_p ())
2261 dump_printf_loc (MSG_NOTE, vect_location,
2262 "Alignment of access forced using versioning.\n");
2265 if (dump_enabled_p ())
2266 dump_printf_loc (MSG_NOTE, vect_location,
2267 "Versioning for alignment will be applied.\n");
2269 /* Peeling and versioning can't be done together at this time. */
2270 gcc_assert (! (do_peeling && do_versioning));
2272 stat = vect_verify_datarefs_alignment (loop_vinfo);
2273 gcc_assert (stat);
2274 return stat;
2277 /* This point is reached if neither peeling nor versioning is being done. */
2278 gcc_assert (! (do_peeling || do_versioning));
2280 stat = vect_verify_datarefs_alignment (loop_vinfo);
2281 return stat;
2285 /* Function vect_find_same_alignment_drs.
2287 Update group and alignment relations according to the chosen
2288 vectorization factor. */
2290 static void
2291 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2293 struct data_reference *dra = DDR_A (ddr);
2294 struct data_reference *drb = DDR_B (ddr);
2295 stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
2296 stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
2298 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2299 return;
2301 if (dra == drb)
2302 return;
2304 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2305 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2306 return;
2308 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2309 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2310 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2311 return;
2313 /* Two references with distance zero have the same alignment. */
2314 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2315 - wi::to_poly_offset (DR_INIT (drb)));
2316 if (maybe_ne (diff, 0))
2318 /* Get the wider of the two alignments. */
2319 unsigned int align_a = (vect_calculate_target_alignment (dra)
2320 / BITS_PER_UNIT);
2321 unsigned int align_b = (vect_calculate_target_alignment (drb)
2322 / BITS_PER_UNIT);
2323 unsigned int max_align = MAX (align_a, align_b);
2325 /* Require the gap to be a multiple of the larger vector alignment. */
2326 if (!multiple_p (diff, max_align))
2327 return;
2330 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2331 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2332 if (dump_enabled_p ())
2334 dump_printf_loc (MSG_NOTE, vect_location,
2335 "accesses have the same alignment: ");
2336 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2337 dump_printf (MSG_NOTE, " and ");
2338 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2339 dump_printf (MSG_NOTE, "\n");
2344 /* Function vect_analyze_data_refs_alignment
2346 Analyze the alignment of the data-references in the loop.
2347 Return FALSE if a data reference is found that cannot be vectorized. */
2349 bool
2350 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2352 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2354 /* Mark groups of data references with the same alignment using
2355 data dependence information. */
2356 vec<ddr_p> ddrs = vinfo->ddrs;
2357 struct data_dependence_relation *ddr;
2358 unsigned int i;
2360 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2361 vect_find_same_alignment_drs (ddr);
2363 vec<data_reference_p> datarefs = vinfo->datarefs;
2364 struct data_reference *dr;
2366 vect_record_base_alignments (vinfo);
2367 FOR_EACH_VEC_ELT (datarefs, i, dr)
2369 stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
2370 if (STMT_VINFO_VECTORIZABLE (stmt_info))
2371 vect_compute_data_ref_alignment (dr);
2374 return true;
2378 /* Analyze alignment of DRs of stmts in NODE. */
2380 static bool
2381 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2383 /* We vectorize from the first scalar stmt in the node unless
2384 the node is permuted, in which case we start from the first
2385 element in the group. */
2386 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2387 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2388 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2389 first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2391 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2392 vect_compute_data_ref_alignment (dr);
2393 /* For creating the data-ref pointer we need alignment of the
2394 first element anyway. */
2395 if (dr != first_dr)
2396 vect_compute_data_ref_alignment (first_dr);
2397 if (! verify_data_ref_alignment (dr))
2399 if (dump_enabled_p ())
2400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2401 "not vectorized: bad data alignment in basic "
2402 "block.\n");
2403 return false;
2406 return true;
2409 /* Function vect_slp_analyze_and_verify_instance_alignment
2411 Analyze the alignment of the data-references in the SLP instance.
2412 Return FALSE if a data reference is found that cannot be vectorized. */
2414 bool
2415 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2417 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2419 slp_tree node;
2420 unsigned i;
2421 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2422 if (! vect_slp_analyze_and_verify_node_alignment (node))
2423 return false;
2425 node = SLP_INSTANCE_TREE (instance);
2426 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2427 && ! vect_slp_analyze_and_verify_node_alignment
2428 (SLP_INSTANCE_TREE (instance)))
2429 return false;
2431 return true;
2435 /* Analyze groups of accesses: check that DR belongs to a group of
2436 accesses of legal size, step, etc. Detect gaps, single element
2437 interleaving, and other special cases. Set grouped access info.
2438 Collect groups of strided stores for further use in SLP analysis.
2439 Worker for vect_analyze_group_access. */
2441 static bool
2442 vect_analyze_group_access_1 (struct data_reference *dr)
2444 tree step = DR_STEP (dr);
2445 tree scalar_type = TREE_TYPE (DR_REF (dr));
2446 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2447 gimple *stmt = vect_dr_stmt (dr);
2448 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2449 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2450 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2451 HOST_WIDE_INT dr_step = -1;
2452 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2453 bool slp_impossible = false;
2455 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2456 size of the interleaving group (including gaps). */
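/* For instance, for int loads a[3*i] and a[3*i+1] the step is 12 bytes
   and the type size is 4, so GROUPSIZE is 3 and the group has a gap of
   one element after its last member.  */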
2457 if (tree_fits_shwi_p (step))
2459 dr_step = tree_to_shwi (step);
2460 /* Check that STEP is a multiple of type size. Otherwise there is
2461 a non-element-sized gap at the end of the group which we
2462 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2463 ??? As we can handle non-constant step fine here we should
2464 simply remove uses of DR_GROUP_GAP between the last and first
2465 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2466 simply not include that gap. */
2467 if ((dr_step % type_size) != 0)
2469 if (dump_enabled_p ())
2471 dump_printf_loc (MSG_NOTE, vect_location,
2472 "Step ");
2473 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2474 dump_printf (MSG_NOTE,
2475 " is not a multiple of the element size for ");
2476 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2477 dump_printf (MSG_NOTE, "\n");
2479 return false;
2481 groupsize = absu_hwi (dr_step) / type_size;
2483 else
2484 groupsize = 0;
2486 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2487 if (!DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2489 /* Check whether this DR is a part of an interleaving group, and is a single
2490 element of the group that is accessed in the loop. */
2492 /* Gaps are supported only for loads. STEP must be a multiple of the type
2493 size. */
2494 if (DR_IS_READ (dr)
2495 && (dr_step % type_size) == 0
2496 && groupsize > 0)
2498 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2499 DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2500 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2501 if (dump_enabled_p ())
2503 dump_printf_loc (MSG_NOTE, vect_location,
2504 "Detected single element interleaving ");
2505 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2506 dump_printf (MSG_NOTE, " step ");
2507 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2508 dump_printf (MSG_NOTE, "\n");
2511 return true;
2514 if (dump_enabled_p ())
2516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2517 "not consecutive access ");
2518 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2521 if (bb_vinfo)
2523 /* Mark the statement as unvectorizable. */
2524 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr))) = false;
2525 return true;
2528 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2529 STMT_VINFO_STRIDED_P (stmt_info) = true;
2530 return true;
2533 if (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2535 /* First stmt in the interleaving chain. Check the chain. */
2536 gimple *next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2537 struct data_reference *data_ref = dr;
2538 unsigned int count = 1;
2539 tree prev_init = DR_INIT (data_ref);
2540 gimple *prev = stmt;
2541 HOST_WIDE_INT diff, gaps = 0;
2543 /* By construction, all group members have INTEGER_CST DR_INITs. */
2544 while (next)
2546 /* Skip same data-refs. In case two or more stmts share a
2547 data-ref (supported only for loads), we vectorize only the first
2548 stmt, and the rest get their vectorized loads from the first
2549 one. */
2550 if (!tree_int_cst_compare (DR_INIT (data_ref),
2551 DR_INIT (STMT_VINFO_DATA_REF (
2552 vinfo_for_stmt (next)))))
2554 if (DR_IS_WRITE (data_ref))
2556 if (dump_enabled_p ())
2557 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2558 "Two store stmts share the same dr.\n");
2559 return false;
2562 if (dump_enabled_p ())
2563 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2564 "Two or more load stmts share the same dr.\n");
2566 /* For loads, use the same data-ref load. */
2567 DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2569 prev = next;
2570 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2571 continue;
2574 prev = next;
2575 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2577 /* All group members have the same STEP by construction. */
2578 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2580 /* Check that the distance between two accesses is equal to the type
2581 size. Otherwise, we have gaps. */
2582 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2583 - TREE_INT_CST_LOW (prev_init)) / type_size;
2584 if (diff != 1)
2586 /* FORNOW: SLP of accesses with gaps is not supported. */
2587 slp_impossible = true;
2588 if (DR_IS_WRITE (data_ref))
2590 if (dump_enabled_p ())
2591 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2592 "interleaved store with gaps\n");
2593 return false;
2596 gaps += diff - 1;
2599 last_accessed_element += diff;
2601 /* Store the gap from the previous member of the group. If there is no
2602 gap in the access, DR_GROUP_GAP is always 1. */
2603 DR_GROUP_GAP (vinfo_for_stmt (next)) = diff;
2605 prev_init = DR_INIT (data_ref);
2606 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2607 /* Count the number of data-refs in the chain. */
2608 count++;
2611 if (groupsize == 0)
2612 groupsize = count + gaps;
2614 /* This could be UINT_MAX but, as we are generating code in a very
2615 inefficient way, we have to cap it earlier. See PR78699 for example. */
2616 if (groupsize > 4096)
2618 if (dump_enabled_p ())
2619 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2620 "group is too large\n");
2621 return false;
2624 /* Check that the size of the interleaving is equal to count for stores,
2625 i.e., that there are no gaps. */
2626 if (groupsize != count
2627 && !DR_IS_READ (dr))
2629 if (dump_enabled_p ())
2630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2631 "interleaved store with gaps\n");
2632 return false;
2635 /* If there is a gap after the last load in the group it is the
2636 difference between the groupsize and the last accessed
2637 element.
2638 When there is no gap, this difference should be 0. */
2639 DR_GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2641 DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2642 if (dump_enabled_p ())
2644 dump_printf_loc (MSG_NOTE, vect_location,
2645 "Detected interleaving ");
2646 if (DR_IS_READ (dr))
2647 dump_printf (MSG_NOTE, "load ");
2648 else
2649 dump_printf (MSG_NOTE, "store ");
2650 dump_printf (MSG_NOTE, "of size %u starting with ",
2651 (unsigned)groupsize);
2652 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2653 if (DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2654 dump_printf_loc (MSG_NOTE, vect_location,
2655 "There is a gap of %u elements after the group\n",
2656 DR_GROUP_GAP (vinfo_for_stmt (stmt)));
2659 /* SLP: create an SLP data structure for every interleaving group of
2660 stores for further analysis in vect_analyze_slp. */
2661 if (DR_IS_WRITE (dr) && !slp_impossible)
2663 if (loop_vinfo)
2664 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2665 if (bb_vinfo)
2666 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2670 return true;
2673 /* Analyze groups of accesses: check that DR belongs to a group of
2674 accesses of legal size, step, etc. Detect gaps, single element
2675 interleaving, and other special cases. Set grouped access info.
2676 Collect groups of strided stores for further use in SLP analysis. */
2678 static bool
2679 vect_analyze_group_access (struct data_reference *dr)
2681 if (!vect_analyze_group_access_1 (dr))
2683 /* Dissolve the group if present. */
2684 gimple *next;
2685 gimple *stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (dr)));
2686 while (stmt)
2688 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2689 next = DR_GROUP_NEXT_ELEMENT (vinfo);
2690 DR_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2691 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2692 stmt = next;
2694 return false;
2696 return true;
2699 /* Analyze the access pattern of the data-reference DR.
2700 In case of non-consecutive accesses call vect_analyze_group_access() to
2701 analyze groups of accesses. */
2703 static bool
2704 vect_analyze_data_ref_access (struct data_reference *dr)
2706 tree step = DR_STEP (dr);
2707 tree scalar_type = TREE_TYPE (DR_REF (dr));
2708 gimple *stmt = vect_dr_stmt (dr);
2709 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2710 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2711 struct loop *loop = NULL;
2713 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2714 return true;
2716 if (loop_vinfo)
2717 loop = LOOP_VINFO_LOOP (loop_vinfo);
2719 if (loop_vinfo && !step)
2721 if (dump_enabled_p ())
2722 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2723 "bad data-ref access in loop\n");
2724 return false;
2727 /* Allow loads with zero step in inner-loop vectorization. */
2728 if (loop_vinfo && integer_zerop (step))
2730 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2731 if (!nested_in_vect_loop_p (loop, stmt))
2732 return DR_IS_READ (dr);
2733 /* Allow references with zero step for outer loops marked
2734 with pragma omp simd only - it guarantees absence of
2735 loop-carried dependencies between inner loop iterations. */
2736 if (loop->safelen < 2)
2738 if (dump_enabled_p ())
2739 dump_printf_loc (MSG_NOTE, vect_location,
2740 "zero step in inner loop of nest\n");
2741 return false;
2745 if (loop && nested_in_vect_loop_p (loop, stmt))
2747 /* Interleaved accesses are not yet supported within outer-loop
2748 vectorization for references in the inner-loop. */
2749 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2751 /* For the rest of the analysis we use the outer-loop step. */
2752 step = STMT_VINFO_DR_STEP (stmt_info);
2753 if (integer_zerop (step))
2755 if (dump_enabled_p ())
2756 dump_printf_loc (MSG_NOTE, vect_location,
2757 "zero step in outer loop.\n");
2758 return DR_IS_READ (dr);
2762 /* Consecutive? */
2763 if (TREE_CODE (step) == INTEGER_CST)
2765 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2766 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2767 || (dr_step < 0
2768 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2770 /* Mark that it is not interleaving. */
2771 DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2772 return true;
2776 if (loop && nested_in_vect_loop_p (loop, stmt))
2778 if (dump_enabled_p ())
2779 dump_printf_loc (MSG_NOTE, vect_location,
2780 "grouped access in outer loop.\n");
2781 return false;
2785 /* Assume this is a DR handled by the non-constant strided load case. */
2786 if (TREE_CODE (step) != INTEGER_CST)
2787 return (STMT_VINFO_STRIDED_P (stmt_info)
2788 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2789 || vect_analyze_group_access (dr)));
2791 /* Not a consecutive access - check if it's part of an interleaving group. */
2792 return vect_analyze_group_access (dr);
2795 /* Compare two data-references DRA and DRB so that they can be sorted
2796 into chunks suitable for grouping. */
2798 static int
2799 dr_group_sort_cmp (const void *dra_, const void *drb_)
2801 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2802 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2803 int cmp;
2805 /* Stabilize sort. */
2806 if (dra == drb)
2807 return 0;
2809 /* DRs in different loops never belong to the same group. */
2810 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2811 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2812 if (loopa != loopb)
2813 return loopa->num < loopb->num ? -1 : 1;
2815 /* Ordering of DRs according to base. */
2816 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2817 DR_BASE_ADDRESS (drb));
2818 if (cmp != 0)
2819 return cmp;
2821 /* And according to DR_OFFSET. */
2822 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2823 if (cmp != 0)
2824 return cmp;
2826 /* Put reads before writes. */
2827 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2828 return DR_IS_READ (dra) ? -1 : 1;
2830 /* Then sort by access size. */
2831 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2832 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2833 if (cmp != 0)
2834 return cmp;
2836 /* And by step. */
2837 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2838 if (cmp != 0)
2839 return cmp;
2841 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2842 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2843 if (cmp == 0)
2844 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2845 return cmp;
2848 /* If OP is the result of a conversion, return the unconverted value,
2849 otherwise return null. */
2851 static tree
2852 strip_conversion (tree op)
2854 if (TREE_CODE (op) != SSA_NAME)
2855 return NULL_TREE;
2856 gimple *stmt = SSA_NAME_DEF_STMT (op);
2857 if (!is_gimple_assign (stmt)
2858 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2859 return NULL_TREE;
2860 return gimple_assign_rhs1 (stmt);
2863 /* Return true if vectorizable_* routines can handle statements STMT1
2864 and STMT2 being in a single group. */
2866 static bool
2867 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2869 if (gimple_assign_single_p (stmt1))
2870 return gimple_assign_single_p (stmt2);
2872 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2874 /* Check for two masked loads or two masked stores. */
2875 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2876 return false;
2877 internal_fn ifn = gimple_call_internal_fn (stmt1);
2878 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2879 return false;
2880 if (ifn != gimple_call_internal_fn (stmt2))
2881 return false;
2883 /* Check that the masks are the same. Cope with casts of masks,
2884 like those created by build_mask_conversion. */
2885 tree mask1 = gimple_call_arg (stmt1, 2);
2886 tree mask2 = gimple_call_arg (stmt2, 2);
2887 if (!operand_equal_p (mask1, mask2, 0))
2889 mask1 = strip_conversion (mask1);
2890 if (!mask1)
2891 return false;
2892 mask2 = strip_conversion (mask2);
2893 if (!mask2)
2894 return false;
2895 if (!operand_equal_p (mask1, mask2, 0))
2896 return false;
2898 return true;
2901 return false;
2904 /* Function vect_analyze_data_ref_accesses.
2906 Analyze the access pattern of all the data references in the loop.
2908 FORNOW: the only access pattern that is considered vectorizable is a
2909 simple step 1 (consecutive) access.
2911 FORNOW: handle only arrays and pointer accesses. */
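/* For instance, the stores p[2*i] = x and p[2*i+1] = y share base,
   offset and step, and their DR_INITs differ by one element, so they are
   linked into a single interleaving chain headed by the first store and
   later analyzed as a group of size 2.  */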
2913 bool
2914 vect_analyze_data_ref_accesses (vec_info *vinfo)
2916 unsigned int i;
2917 vec<data_reference_p> datarefs = vinfo->datarefs;
2918 struct data_reference *dr;
2920 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2922 if (datarefs.is_empty ())
2923 return true;
2925 /* Sort the array of datarefs to make building the interleaving chains
2926 linear. Don't modify the original vector's order; it is needed for
2927 determining which dependencies are reversed. */
2928 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2929 datarefs_copy.qsort (dr_group_sort_cmp);
2931 /* Build the interleaving chains. */
2932 for (i = 0; i < datarefs_copy.length () - 1;)
2934 data_reference_p dra = datarefs_copy[i];
2935 stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
2936 stmt_vec_info lastinfo = NULL;
2937 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2938 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2940 ++i;
2941 continue;
2943 for (i = i + 1; i < datarefs_copy.length (); ++i)
2945 data_reference_p drb = datarefs_copy[i];
2946 stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
2947 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2948 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2949 break;
2951 /* ??? Imperfect sorting (non-compatible types, non-modulo
2952 accesses, same accesses) can lead to a group being artificially
2953 split here, as we don't just skip over those. If it really
2954 matters we can push those to a worklist and re-iterate
2955 over them. Then we can just skip ahead to the next DR here. */
2957 /* DRs in different loops should not be put into the same
2958 interleaving group. */
2959 if (gimple_bb (DR_STMT (dra))->loop_father
2960 != gimple_bb (DR_STMT (drb))->loop_father)
2961 break;
2963 /* Check that the data-refs have the same first location (except init)
2964 and that they are both either stores or loads (not a load and a store,
2965 and not masked loads or stores). */
2966 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2967 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2968 DR_BASE_ADDRESS (drb)) != 0
2969 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2970 || !can_group_stmts_p (vect_dr_stmt (dra), vect_dr_stmt (drb)))
2971 break;
2973 /* Check that the data-refs have the same constant size. */
2974 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2975 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2976 if (!tree_fits_uhwi_p (sza)
2977 || !tree_fits_uhwi_p (szb)
2978 || !tree_int_cst_equal (sza, szb))
2979 break;
2981 /* Check that the data-refs have the same step. */
2982 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2983 break;
2985 /* Check the types are compatible.
2986 ??? We don't distinguish this during sorting. */
2987 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2988 TREE_TYPE (DR_REF (drb))))
2989 break;
2991 /* Check that the DR_INITs are compile-time constants. */
2992 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2993 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2994 break;
2996 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2997 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2998 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2999 HOST_WIDE_INT init_prev
3000 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3001 gcc_assert (init_a <= init_b
3002 && init_a <= init_prev
3003 && init_prev <= init_b);
3005 /* Do not place the same access in the interleaving chain twice. */
3006 if (init_b == init_prev)
3008 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3009 < gimple_uid (DR_STMT (drb)));
3010 /* ??? For now we simply "drop" the later reference which is
3011 otherwise the same rather than finishing off this group.
3012 In the end we'd want to re-process duplicates forming
3013 multiple groups from the refs, likely by just collecting
3014 all candidates (including duplicates and split points
3015 below) in a vector and then processing them together. */
3016 continue;
3019 /* If init_b == init_a + the size of the type * k, we have an
3020 interleaving, and DRA is accessed before DRB. */
3021 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3022 if (type_size_a == 0
3023 || (init_b - init_a) % type_size_a != 0)
3024 break;
3026 /* If we have a store, the accesses are adjacent. This splits
3027 groups into chunks we support (we don't support vectorization
3028 of stores with gaps). */
3029 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3030 break;
3032 /* If the step (when constant and nonzero) is greater than the
3033 difference between the data-refs' inits, this splits groups into
3034 suitable sizes. */
3035 if (tree_fits_shwi_p (DR_STEP (dra)))
3037 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3038 if (step != 0 && step <= (init_b - init_a))
3039 break;
3042 if (dump_enabled_p ())
3044 dump_printf_loc (MSG_NOTE, vect_location,
3045 "Detected interleaving ");
3046 if (DR_IS_READ (dra))
3047 dump_printf (MSG_NOTE, "load ");
3048 else
3049 dump_printf (MSG_NOTE, "store ");
3050 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3051 dump_printf (MSG_NOTE, " and ");
3052 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3053 dump_printf (MSG_NOTE, "\n");
3056 /* Link the found element into the group list. */
3057 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3059 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = vect_dr_stmt (dra);
3060 lastinfo = stmtinfo_a;
3062 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = vect_dr_stmt (dra);
3063 DR_GROUP_NEXT_ELEMENT (lastinfo) = vect_dr_stmt (drb);
3064 lastinfo = stmtinfo_b;
3068 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3069 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr)))
3070 && !vect_analyze_data_ref_access (dr))
3072 if (dump_enabled_p ())
3073 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3074 "not vectorized: complicated access pattern.\n");
3076 if (is_a <bb_vec_info> (vinfo))
3078 /* Mark the statement as not vectorizable. */
3079 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr))) = false;
3080 continue;
3082 else
3084 datarefs_copy.release ();
3085 return false;
3089 datarefs_copy.release ();
3090 return true;
3093 /* Function vect_vfa_segment_size.
3095 Input:
3096 DR: The data reference.
3097 LENGTH_FACTOR: segment length to consider.
3099 Return a value suitable for the dr_with_seg_len::seg_len field.
3100 This is the "distance travelled" by the pointer from the first
3101 iteration in the segment to the last. Note that it does not include
3102 the size of the access; in effect it only describes the first byte. */
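/* For instance, with a 4-byte step and a LENGTH_FACTOR (vectorization
   factor) of 8, the segment length is 4 * (8 - 1) = 28 bytes; together
   with the 4-byte access size recorded separately this covers the full
   32 bytes touched by one vector iteration.  */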
3104 static tree
3105 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3107 length_factor = size_binop (MINUS_EXPR,
3108 fold_convert (sizetype, length_factor),
3109 size_one_node);
3110 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3111 length_factor);
3114 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3115 gives the worst-case number of bytes covered by the segment. */
3117 static unsigned HOST_WIDE_INT
3118 vect_vfa_access_size (data_reference *dr)
3120 stmt_vec_info stmt_vinfo = vinfo_for_stmt (vect_dr_stmt (dr));
3121 tree ref_type = TREE_TYPE (DR_REF (dr));
3122 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3123 unsigned HOST_WIDE_INT access_size = ref_size;
3124 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3126 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == vect_dr_stmt (dr));
3127 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3129 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3130 && (vect_supportable_dr_alignment (dr, false)
3131 == dr_explicit_realign_optimized))
3133 /* We might access a full vector's worth. */
3134 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3135 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3137 return access_size;
3140 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3142 static unsigned int
3143 vect_vfa_align (const data_reference *dr)
3145 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3148 /* Function vect_compile_time_alias.
3150 Given data references A and B with equal base and offset, see whether
3151 the alias relation can be decided at compilation time. Return 1 if
3152 it can and the references alias, 0 if it can and the references do
3153 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3154 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3155 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3157 static int
3158 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3159 tree segment_length_a, tree segment_length_b,
3160 unsigned HOST_WIDE_INT access_size_a,
3161 unsigned HOST_WIDE_INT access_size_b)
3163 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3164 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3165 poly_uint64 const_length_a;
3166 poly_uint64 const_length_b;
3168 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3169 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3170 [a, a+12) */
3171 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3173 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3174 offset_a = (offset_a + access_size_a) - const_length_a;
3176 else
3177 const_length_a = tree_to_poly_uint64 (segment_length_a);
3178 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3180 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3181 offset_b = (offset_b + access_size_b) - const_length_b;
3183 else
3184 const_length_b = tree_to_poly_uint64 (segment_length_b);
3186 const_length_a += access_size_a;
3187 const_length_b += access_size_b;
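  /* For instance, if A starts at offset 0 and B at offset 16, and both
     have segment length 28 and access size 4, the covered ranges are
     [0, 32) and [16, 48); these are known to overlap, so we return 1 and
     the caller reports a compile-time alias.  */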
3189 if (ranges_known_overlap_p (offset_a, const_length_a,
3190 offset_b, const_length_b))
3191 return 1;
3193 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3194 offset_b, const_length_b))
3195 return 0;
3197 return -1;
3200 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3201 in DDR is >= VF. */
3203 static bool
3204 dependence_distance_ge_vf (data_dependence_relation *ddr,
3205 unsigned int loop_depth, poly_uint64 vf)
3207 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3208 || DDR_NUM_DIST_VECTS (ddr) == 0)
3209 return false;
3211 /* If the dependence is exact, we should have limited the VF instead. */
3212 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3214 unsigned int i;
3215 lambda_vector dist_v;
3216 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3218 HOST_WIDE_INT dist = dist_v[loop_depth];
3219 if (dist != 0
3220 && !(dist > 0 && DDR_REVERSED_P (ddr))
3221 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3222 return false;
3225 if (dump_enabled_p ())
3227 dump_printf_loc (MSG_NOTE, vect_location,
3228 "dependence distance between ");
3229 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3230 dump_printf (MSG_NOTE, " and ");
3231 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3232 dump_printf (MSG_NOTE, " is >= VF\n");
3235 return true;
3238 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3240 static void
3241 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3243 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3244 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3245 dump_printf (dump_kind, ") >= ");
3246 dump_dec (dump_kind, lower_bound.min_value);
3249 /* Record that the vectorized loop requires the vec_lower_bound described
3250 by EXPR, UNSIGNED_P and MIN_VALUE. */
3252 static void
3253 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3254 poly_uint64 min_value)
3256 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3257 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3258 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3260 unsigned_p &= lower_bounds[i].unsigned_p;
3261 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3262 if (lower_bounds[i].unsigned_p != unsigned_p
3263 || maybe_lt (lower_bounds[i].min_value, min_value))
3265 lower_bounds[i].unsigned_p = unsigned_p;
3266 lower_bounds[i].min_value = min_value;
3267 if (dump_enabled_p ())
3269 dump_printf_loc (MSG_NOTE, vect_location,
3270 "updating run-time check to ");
3271 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3272 dump_printf (MSG_NOTE, "\n");
3275 return;
3278 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3279 if (dump_enabled_p ())
3281 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3282 dump_lower_bound (MSG_NOTE, lower_bound);
3283 dump_printf (MSG_NOTE, "\n");
3285 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3288 /* Return true if it's unlikely that the step of the vectorized form of DR
3289 will span fewer than GAP bytes. */
3291 static bool
3292 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3294 stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
3295 HOST_WIDE_INT count
3296 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3297 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3298 count *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
3299 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3302 /* Return true if we know that there is no alias between DR_A and DR_B
3303 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3304 *LOWER_BOUND_OUT to this N. */
3306 static bool
3307 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3308 poly_uint64 *lower_bound_out)
3310 /* Check that there is a constant gap of known sign between DR_A
3311 and DR_B. */
3312 poly_int64 init_a, init_b;
3313 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3314 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3315 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3316 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3317 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3318 || !ordered_p (init_a, init_b))
3319 return false;
3321 /* Sort DR_A and DR_B by the address they access. */
3322 if (maybe_lt (init_b, init_a))
3324 std::swap (init_a, init_b);
3325 std::swap (dr_a, dr_b);
3328 /* If the two accesses could be dependent within a scalar iteration,
3329 make sure that we'd retain their order. */
3330 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3331 && !vect_preserves_scalar_order_p (vect_dr_stmt (dr_a),
3332 vect_dr_stmt (dr_b)))
3333 return false;
3335 /* There is no alias if abs (DR_STEP) is greater than or equal to
3336 the bytes spanned by the combination of the two accesses. */
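  /* For instance, for 4-byte accesses at BASE + 4 and BASE + 12 the
     lower bound is 12 + 4 - 4 = 12: the accesses cannot alias across
     iterations as long as abs (DR_STEP) >= 12.  */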
3337 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3338 return true;
3341 /* Function vect_prune_runtime_alias_test_list.
3343 Prune a list of ddrs to be tested at run-time by versioning for alias.
3344 Merge several alias checks into one if possible.
3345 Return FALSE if the resulting list of ddrs is longer than allowed by
3346 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3348 bool
3349 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3351 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3352 hash_set <tree_pair_hash> compared_objects;
3354 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3355 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3356 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3357 vec<vec_object_pair> &check_unequal_addrs
3358 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3359 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3360 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3362 ddr_p ddr;
3363 unsigned int i;
3364 tree length_factor;
3366 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3368 /* Step values are irrelevant for aliasing if the number of vector
3369 iterations is equal to the number of scalar iterations (which can
3370 happen for fully-SLP loops). */
3371 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3373 if (!ignore_step_p)
3375 /* Convert the checks for nonzero steps into bound tests. */
3376 tree value;
3377 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3378 vect_check_lower_bound (loop_vinfo, value, true, 1);
3381 if (may_alias_ddrs.is_empty ())
3382 return true;
3384 comp_alias_ddrs.create (may_alias_ddrs.length ());
3386 unsigned int loop_depth
3387 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3388 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3390 /* First, we collect all data ref pairs for aliasing checks. */
3391 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3393 int comp_res;
3394 poly_uint64 lower_bound;
3395 struct data_reference *dr_a, *dr_b;
3396 gimple *dr_group_first_a, *dr_group_first_b;
3397 tree segment_length_a, segment_length_b;
3398 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3399 unsigned int align_a, align_b;
3400 gimple *stmt_a, *stmt_b;
3402 /* Ignore the alias if the VF we chose ended up being no greater
3403 than the dependence distance. */
3404 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3405 continue;
3407 if (DDR_OBJECT_A (ddr))
3409 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3410 if (!compared_objects.add (new_pair))
3412 if (dump_enabled_p ())
3414 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3415 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3416 dump_printf (MSG_NOTE, " and ");
3417 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3418 dump_printf (MSG_NOTE, " have different addresses\n");
3420 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3422 continue;
3425 dr_a = DDR_A (ddr);
3426 stmt_a = vect_dr_stmt (DDR_A (ddr));
3428 dr_b = DDR_B (ddr);
3429 stmt_b = vect_dr_stmt (DDR_B (ddr));
3431 /* Skip the pair if inter-iteration dependencies are irrelevant
3432 and intra-iteration dependencies are guaranteed to be honored. */
3433 if (ignore_step_p
3434 && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3435 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3437 if (dump_enabled_p ())
3439 dump_printf_loc (MSG_NOTE, vect_location,
3440 "no need for alias check between ");
3441 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3442 dump_printf (MSG_NOTE, " and ");
3443 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3444 dump_printf (MSG_NOTE, " when VF is 1\n");
3446 continue;
3449 /* See whether we can handle the alias using a bounds check on
3450 the step, and whether that's likely to be the best approach.
3451 (It might not be, for example, if the minimum step is much larger
3452 than the number of bytes handled by one vector iteration.) */
3453 if (!ignore_step_p
3454 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3455 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3456 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3457 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3459 bool unsigned_p = dr_known_forward_stride_p (dr_a);
3460 if (dump_enabled_p ())
3462 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3463 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3464 dump_printf (MSG_NOTE, " and ");
3465 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3466 dump_printf (MSG_NOTE, " when the step ");
3467 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3468 dump_printf (MSG_NOTE, " is outside ");
3469 if (unsigned_p)
3470 dump_printf (MSG_NOTE, "[0");
3471 else
3473 dump_printf (MSG_NOTE, "(");
3474 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3476 dump_printf (MSG_NOTE, ", ");
3477 dump_dec (MSG_NOTE, lower_bound);
3478 dump_printf (MSG_NOTE, ")\n");
3480 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3481 lower_bound);
3482 continue;
3485 dr_group_first_a = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3486 if (dr_group_first_a)
3488 stmt_a = dr_group_first_a;
3489 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3492 dr_group_first_b = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3493 if (dr_group_first_b)
3495 stmt_b = dr_group_first_b;
3496 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3499 if (ignore_step_p)
3501 segment_length_a = size_zero_node;
3502 segment_length_b = size_zero_node;
3504 else
3506 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3507 length_factor = scalar_loop_iters;
3508 else
3509 length_factor = size_int (vect_factor);
3510 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3511 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3513 access_size_a = vect_vfa_access_size (dr_a);
3514 access_size_b = vect_vfa_access_size (dr_b);
3515 align_a = vect_vfa_align (dr_a);
3516 align_b = vect_vfa_align (dr_b);
3518 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3519 DR_BASE_ADDRESS (dr_b));
3520 if (comp_res == 0)
3521 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3522 DR_OFFSET (dr_b));
3524 /* See whether the alias is known at compilation time. */
3525 if (comp_res == 0
3526 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3527 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3528 && poly_int_tree_p (segment_length_a)
3529 && poly_int_tree_p (segment_length_b))
3531 int res = vect_compile_time_alias (dr_a, dr_b,
3532 segment_length_a,
3533 segment_length_b,
3534 access_size_a,
3535 access_size_b);
3536 if (res >= 0 && dump_enabled_p ())
3538 dump_printf_loc (MSG_NOTE, vect_location,
3539 "can tell at compile time that ");
3540 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3541 dump_printf (MSG_NOTE, " and ");
3542 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3543 if (res == 0)
3544 dump_printf (MSG_NOTE, " do not alias\n");
3545 else
3546 dump_printf (MSG_NOTE, " alias\n");
3549 if (res == 0)
3550 continue;
3552 if (res == 1)
3554 if (dump_enabled_p ())
3555 dump_printf_loc (MSG_NOTE, vect_location,
3556 "not vectorized: compilation time alias.\n");
3557 return false;
3561 dr_with_seg_len_pair_t dr_with_seg_len_pair
3562 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3563 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3565 /* Canonicalize pairs by sorting the two DR members. */
3566 if (comp_res > 0)
3567 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3569 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3572 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3574 unsigned int count = (comp_alias_ddrs.length ()
3575 + check_unequal_addrs.length ());
3577 dump_printf_loc (MSG_NOTE, vect_location,
3578 "improved number of alias checks from %d to %d\n",
3579 may_alias_ddrs.length (), count);
3580 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3582 if (dump_enabled_p ())
3583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3584 "number of versioning for alias "
3585 "run-time tests exceeds %d "
3586 "(--param vect-max-version-for-alias-checks)\n",
3587 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3588 return false;
3591 return true;
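/* Illustrative sketch (not code generated by the function above): for a
   loop such as

     void
     f (int *a, int *b, int n)
     {
       for (int i = 0; i < n; i++)
         a[i] = b[i] + 1;
     }

   in which A and B may alias, each pair that survives in COMP_ALIAS_DDRS
   leads loop versioning to guard the vectorized copy with a
   segment-overlap test of roughly this shape, where seg_a and seg_b
   stand for the segment lengths and access sizes computed above:

     if (a + seg_a <= b || b + seg_b <= a)
       ... vectorized loop ...
     else
       ... scalar loop ...

   The names f, seg_a and seg_b are used for illustration only.  */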
3594 /* Check whether we can use an internal function for a gather load
3595 or scatter store. READ_P is true for loads and false for stores.
3596 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3597 the type of the memory elements being loaded or stored. OFFSET_BITS
3598 is the number of bits in each scalar offset and OFFSET_SIGN is the
3599 sign of the offset. SCALE is the amount by which the offset should
3600 be multiplied *after* it has been converted to address width.
3602 Return true if the function is supported, storing the function
3603 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
3605 bool
3606 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3607 tree memory_type, unsigned int offset_bits,
3608 signop offset_sign, int scale,
3609 internal_fn *ifn_out, tree *element_type_out)
3611 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3612 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3613 if (offset_bits > element_bits)
3614 /* Internal functions require the offset to be the same width as
3615 the vector elements. We can extend narrower offsets, but it isn't
3616 safe to truncate wider offsets. */
3617 return false;
3619 if (element_bits != memory_bits)
3620 /* For now the vector elements must be the same width as the
3621 memory elements. */
3622 return false;
3624 /* Work out which function we need. */
3625 internal_fn ifn;
3626 if (read_p)
3627 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3628 else
3629 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3631 /* Test whether the target supports this combination. */
3632 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3633 offset_sign, scale))
3634 return false;
3636 *ifn_out = ifn;
3637 *element_type_out = TREE_TYPE (vectype);
3638 return true;
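/* Worked example for the function above (illustrative only): for a
   gather such as

     double out[N], in[M];
     int idx[N];
     for (int i = 0; i < N; i++)
       out[i] = in[idx[i]];

   MEMORY_TYPE is double (64 bits), the vector elements are also 64 bits
   wide, OFFSET_BITS is 32 (the width of idx[i]), OFFSET_SIGN is SIGNED
   and SCALE is 8 (sizeof (double)).  The 32-bit offsets can be
   sign-extended to the 64-bit element width, so the width checks pass
   and the result depends only on whether the target implements
   IFN_GATHER_LOAD for this combination.  */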
3641 /* CALL is a call to an internal gather load or scatter store function.
3642 Describe the operation in INFO. */
3644 static void
3645 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3647 stmt_vec_info stmt_info = vinfo_for_stmt (call);
3648 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3649 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3651 info->ifn = gimple_call_internal_fn (call);
3652 info->decl = NULL_TREE;
3653 info->base = gimple_call_arg (call, 0);
3654 info->offset = gimple_call_arg (call, 1);
3655 info->offset_dt = vect_unknown_def_type;
3656 info->offset_vectype = NULL_TREE;
3657 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3658 info->element_type = TREE_TYPE (vectype);
3659 info->memory_type = TREE_TYPE (DR_REF (dr));
3662 /* Return true if a non-affine read or write in STMT is suitable for a
3663 gather load or scatter store. Describe the operation in *INFO if so. */
3665 bool
3666 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3667 gather_scatter_info *info)
3669 HOST_WIDE_INT scale = 1;
3670 poly_int64 pbitpos, pbitsize;
3671 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3672 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3673 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3674 tree offtype = NULL_TREE;
3675 tree decl = NULL_TREE, base, off;
3676 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3677 tree memory_type = TREE_TYPE (DR_REF (dr));
3678 machine_mode pmode;
3679 int punsignedp, reversep, pvolatilep = 0;
3680 internal_fn ifn;
3681 tree element_type;
3682 bool masked_p = false;
3684 /* See whether this is already a call to a gather/scatter internal function.
3685 If not, see whether it's a masked load or store. */
3686 gcall *call = dyn_cast <gcall *> (stmt);
3687 if (call && gimple_call_internal_p (call))
3689 ifn = gimple_call_internal_fn (stmt);
3690 if (internal_gather_scatter_fn_p (ifn))
3692 vect_describe_gather_scatter_call (call, info);
3693 return true;
3695 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3698 /* True if we should aim to use internal functions rather than
3699 built-in functions. */
3700 bool use_ifn_p = (DR_IS_READ (dr)
3701 ? supports_vec_gather_load_p ()
3702 : supports_vec_scatter_store_p ());
3704 base = DR_REF (dr);
3705 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3706 see if we can use the def stmt of the address. */
3707 if (masked_p
3708 && TREE_CODE (base) == MEM_REF
3709 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3710 && integer_zerop (TREE_OPERAND (base, 1))
3711 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3713 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3714 if (is_gimple_assign (def_stmt)
3715 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3716 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3719 /* The gather and scatter builtins need address of the form
3720 loop_invariant + vector * {1, 2, 4, 8}
3722 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3723 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3724 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3725 multiplications and additions in it. To get a vector, we need
3726 a single SSA_NAME that will be defined in the loop and will
3727 contain everything that is not loop invariant and that can be
3728 vectorized. The following code attempts to find such a preexisting
3729 SSA_NAME OFF and put the loop invariants into a tree BASE
3730 that can be gimplified before the loop. */
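/* As an illustration (not an exhaustive description): for a reference
   a[k[i]] with loop-invariant A, the offset returned below is roughly
   of the form (sizetype) k[i] * sizeof (*a); the peeling loop that
   follows then folds the multiplication into SCALE and leaves the
   loop-variant SSA_NAME computing k[i] as OFF, with &a and any
   constant displacement accumulated into BASE.  */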
3731 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3732 &punsignedp, &reversep, &pvolatilep);
3733 gcc_assert (base && !reversep);
3734 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3736 if (TREE_CODE (base) == MEM_REF)
3738 if (!integer_zerop (TREE_OPERAND (base, 1)))
3740 if (off == NULL_TREE)
3741 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3742 else
3743 off = size_binop (PLUS_EXPR, off,
3744 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3746 base = TREE_OPERAND (base, 0);
3748 else
3749 base = build_fold_addr_expr (base);
3751 if (off == NULL_TREE)
3752 off = size_zero_node;
3754 /* If base is not loop invariant, then when off is 0 we start with just
3755 the constant offset in the loop invariant BASE and continue with the
3756 original base as OFF; otherwise give up.
3757 We could handle that case by gimplifying the addition of base + off
3758 into some SSA_NAME and use that as off, but for now punt. */
3759 if (!expr_invariant_in_loop_p (loop, base))
3761 if (!integer_zerop (off))
3762 return false;
3763 off = base;
3764 base = size_int (pbytepos);
3766 /* Otherwise put base + constant offset into the loop invariant BASE
3767 and continue with OFF. */
3768 else
3770 base = fold_convert (sizetype, base);
3771 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3774 /* OFF at this point may be either a SSA_NAME or some tree expression
3775 from get_inner_reference. Try to peel off loop invariants from it
3776 into BASE as long as possible. */
3777 STRIP_NOPS (off);
3778 while (offtype == NULL_TREE)
3780 enum tree_code code;
3781 tree op0, op1, add = NULL_TREE;
3783 if (TREE_CODE (off) == SSA_NAME)
3785 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3787 if (expr_invariant_in_loop_p (loop, off))
3788 return false;
3790 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3791 break;
3793 op0 = gimple_assign_rhs1 (def_stmt);
3794 code = gimple_assign_rhs_code (def_stmt);
3795 op1 = gimple_assign_rhs2 (def_stmt);
3797 else
3799 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3800 return false;
3801 code = TREE_CODE (off);
3802 extract_ops_from_tree (off, &code, &op0, &op1);
3804 switch (code)
3806 case POINTER_PLUS_EXPR:
3807 case PLUS_EXPR:
3808 if (expr_invariant_in_loop_p (loop, op0))
3810 add = op0;
3811 off = op1;
3812 do_add:
3813 add = fold_convert (sizetype, add);
3814 if (scale != 1)
3815 add = size_binop (MULT_EXPR, add, size_int (scale));
3816 base = size_binop (PLUS_EXPR, base, add);
3817 continue;
3819 if (expr_invariant_in_loop_p (loop, op1))
3821 add = op1;
3822 off = op0;
3823 goto do_add;
3825 break;
3826 case MINUS_EXPR:
3827 if (expr_invariant_in_loop_p (loop, op1))
3829 add = fold_convert (sizetype, op1);
3830 add = size_binop (MINUS_EXPR, size_zero_node, add);
3831 off = op0;
3832 goto do_add;
3834 break;
3835 case MULT_EXPR:
3836 if (scale == 1 && tree_fits_shwi_p (op1))
3838 int new_scale = tree_to_shwi (op1);
3839 /* Only treat this as a scaling operation if the target
3840 supports it. */
3841 if (use_ifn_p
3842 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3843 vectype, memory_type, 1,
3844 TYPE_SIGN (TREE_TYPE (op0)),
3845 new_scale, &ifn,
3846 &element_type))
3847 break;
3848 scale = new_scale;
3849 off = op0;
3850 continue;
3852 break;
3853 case SSA_NAME:
3854 off = op0;
3855 continue;
3856 CASE_CONVERT:
3857 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3858 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3859 break;
3860 if (TYPE_PRECISION (TREE_TYPE (op0))
3861 == TYPE_PRECISION (TREE_TYPE (off)))
3863 off = op0;
3864 continue;
3867 /* The internal functions need the offset to be the same width
3868 as the elements of VECTYPE. Don't include operations that
3869 cast the offset from that width to a different width. */
3870 if (use_ifn_p
3871 && (int_size_in_bytes (TREE_TYPE (vectype))
3872 == int_size_in_bytes (TREE_TYPE (off))))
3873 break;
3875 if (TYPE_PRECISION (TREE_TYPE (op0))
3876 < TYPE_PRECISION (TREE_TYPE (off)))
3878 off = op0;
3879 offtype = TREE_TYPE (off);
3880 STRIP_NOPS (off);
3881 continue;
3883 break;
3884 default:
3885 break;
3887 break;
3890 /* If at the end OFF still isn't a SSA_NAME or isn't
3891 defined in the loop, punt. */
3892 if (TREE_CODE (off) != SSA_NAME
3893 || expr_invariant_in_loop_p (loop, off))
3894 return false;
3896 if (offtype == NULL_TREE)
3897 offtype = TREE_TYPE (off);
3899 if (use_ifn_p)
3901 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3902 memory_type, TYPE_PRECISION (offtype),
3903 TYPE_SIGN (offtype), scale, &ifn,
3904 &element_type))
3905 return false;
3907 else
3909 if (DR_IS_READ (dr))
3911 if (targetm.vectorize.builtin_gather)
3912 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3914 else
3916 if (targetm.vectorize.builtin_scatter)
3917 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3920 if (!decl)
3921 return false;
3923 ifn = IFN_LAST;
3924 element_type = TREE_TYPE (vectype);
3927 info->ifn = ifn;
3928 info->decl = decl;
3929 info->base = base;
3930 info->offset = off;
3931 info->offset_dt = vect_unknown_def_type;
3932 info->offset_vectype = NULL_TREE;
3933 info->scale = scale;
3934 info->element_type = element_type;
3935 info->memory_type = memory_type;
3936 return true;
3939 /* Find the data references in STMT, analyze them with respect to LOOP and
3940 append them to DATAREFS. Return false if datarefs in this stmt cannot
3941 be handled. */
3943 bool
3944 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3945 vec<data_reference_p> *datarefs)
3947 /* We can ignore clobbers for dataref analysis - they are removed during
3948 loop vectorization and BB vectorization checks dependences with a
3949 stmt walk. */
3950 if (gimple_clobber_p (stmt))
3951 return true;
3953 if (gimple_has_volatile_ops (stmt))
3955 if (dump_enabled_p ())
3957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3958 "not vectorized: volatile type ");
3959 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3961 return false;
3964 if (stmt_can_throw_internal (stmt))
3966 if (dump_enabled_p ())
3968 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3969 "not vectorized: statement can throw an "
3970 "exception ");
3971 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3973 return false;
3976 auto_vec<data_reference_p, 2> refs;
3977 if (!find_data_references_in_stmt (loop, stmt, &refs))
3978 return false;
3980 if (refs.is_empty ())
3981 return true;
3983 if (refs.length () > 1)
3985 if (dump_enabled_p ())
3987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3988 "not vectorized: more than one data ref "
3989 "in stmt: ");
3990 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3992 return false;
3995 if (gcall *call = dyn_cast <gcall *> (stmt))
3996 if (!gimple_call_internal_p (call)
3997 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
3998 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4000 if (dump_enabled_p ())
4002 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4003 "not vectorized: dr in a call ");
4004 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4006 return false;
4009 data_reference_p dr = refs.pop ();
4010 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4011 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4013 if (dump_enabled_p ())
4015 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4016 "not vectorized: statement is bitfield "
4017 "access ");
4018 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4020 return false;
4023 if (DR_BASE_ADDRESS (dr)
4024 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4026 if (dump_enabled_p ())
4027 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4028 "not vectorized: base addr of dr is a "
4029 "constant\n");
4030 return false;
4033 datarefs->safe_push (dr);
4034 return true;
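/* Examples of statements the function above rejects (illustrative
   only): a volatile access such as

     volatile int v;  ...  v = x[i];

   or a bit-field store such as

     struct s { int f : 3; } *p;  ...  p[i].f = 1;

   In both cases the function returns false, so the containing loop or
   basic-block region cannot be vectorized.  */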
4037 /* Function vect_analyze_data_refs.
4039 Find all the data references in the loop or basic block.
4041 The general structure of the analysis of data refs in the vectorizer is as
4042 follows:
4043 1- vect_analyze_data_refs(loop/bb): call
4044 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4045 in the loop/bb and their dependences.
4046 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4047 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4048 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4052 bool
4053 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4055 struct loop *loop = NULL;
4056 unsigned int i;
4057 struct data_reference *dr;
4058 tree scalar_type;
4060 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4062 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4063 loop = LOOP_VINFO_LOOP (loop_vinfo);
4065 /* Go through the data-refs, check that the analysis succeeded. Update
4066 pointer from stmt_vec_info struct to DR and vectype. */
4068 vec<data_reference_p> datarefs = vinfo->datarefs;
4069 FOR_EACH_VEC_ELT (datarefs, i, dr)
4071 gimple *stmt;
4072 stmt_vec_info stmt_info;
4073 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4074 bool simd_lane_access = false;
4075 poly_uint64 vf;
4077 gcc_assert (DR_REF (dr));
4078 stmt = vect_dr_stmt (dr);
4079 stmt_info = vinfo_for_stmt (stmt);
4081 /* Check that analysis of the data-ref succeeded. */
4082 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4083 || !DR_STEP (dr))
4085 bool maybe_gather
4086 = DR_IS_READ (dr)
4087 && !TREE_THIS_VOLATILE (DR_REF (dr))
4088 && (targetm.vectorize.builtin_gather != NULL
4089 || supports_vec_gather_load_p ());
4090 bool maybe_scatter
4091 = DR_IS_WRITE (dr)
4092 && !TREE_THIS_VOLATILE (DR_REF (dr))
4093 && (targetm.vectorize.builtin_scatter != NULL
4094 || supports_vec_scatter_store_p ());
4095 bool maybe_simd_lane_access
4096 = is_a <loop_vec_info> (vinfo) && loop->simduid;
4098 /* If target supports vector gather loads or scatter stores, or if
4099 this might be a SIMD lane access, check whether they can be used. */
4100 if (is_a <loop_vec_info> (vinfo)
4101 && !nested_in_vect_loop_p (loop, stmt))
4103 if (maybe_simd_lane_access)
4105 struct data_reference *newdr
4106 = create_data_ref (NULL, loop_containing_stmt (stmt),
4107 DR_REF (dr), stmt, !maybe_scatter,
4108 DR_IS_CONDITIONAL_IN_STMT (dr));
4109 gcc_assert (newdr != NULL && DR_REF (newdr));
4110 if (DR_BASE_ADDRESS (newdr)
4111 && DR_OFFSET (newdr)
4112 && DR_INIT (newdr)
4113 && DR_STEP (newdr)
4114 && integer_zerop (DR_STEP (newdr)))
4116 tree off = DR_OFFSET (newdr);
4117 STRIP_NOPS (off);
4118 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4119 && TREE_CODE (off) == MULT_EXPR
4120 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4122 tree step = TREE_OPERAND (off, 1);
4123 off = TREE_OPERAND (off, 0);
4124 STRIP_NOPS (off);
4125 if (CONVERT_EXPR_P (off)
4126 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4127 0)))
4128 < TYPE_PRECISION (TREE_TYPE (off)))
4129 off = TREE_OPERAND (off, 0);
4130 if (TREE_CODE (off) == SSA_NAME)
4132 gimple *def = SSA_NAME_DEF_STMT (off);
4133 tree reft = TREE_TYPE (DR_REF (newdr));
4134 if (is_gimple_call (def)
4135 && gimple_call_internal_p (def)
4136 && (gimple_call_internal_fn (def)
4137 == IFN_GOMP_SIMD_LANE))
4139 tree arg = gimple_call_arg (def, 0);
4140 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4141 arg = SSA_NAME_VAR (arg);
4142 if (arg == loop->simduid
4143 /* For now. */
4144 && tree_int_cst_equal
4145 (TYPE_SIZE_UNIT (reft),
4146 step))
4148 DR_OFFSET (newdr) = ssize_int (0);
4149 DR_STEP (newdr) = step;
4150 DR_OFFSET_ALIGNMENT (newdr)
4151 = BIGGEST_ALIGNMENT;
4152 DR_STEP_ALIGNMENT (newdr)
4153 = highest_pow2_factor (step);
4154 dr = newdr;
4155 simd_lane_access = true;
4161 if (!simd_lane_access)
4162 free_data_ref (newdr);
4164 if (!simd_lane_access && (maybe_gather || maybe_scatter))
4166 if (maybe_gather)
4167 gatherscatter = GATHER;
4168 else
4169 gatherscatter = SCATTER;
4173 if (gatherscatter == SG_NONE && !simd_lane_access)
4175 if (dump_enabled_p ())
4177 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4178 "not vectorized: data ref analysis "
4179 "failed ");
4180 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4182 if (is_a <bb_vec_info> (vinfo))
4184 /* In BB vectorization the ref can still participate
4185 in dependence analysis, we just can't vectorize it. */
4186 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4187 continue;
4189 return false;
4193 tree base = get_base_address (DR_REF (dr));
4194 if (base && VAR_P (base) && DECL_NONALIASED (base))
4196 if (dump_enabled_p ())
4198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4199 "not vectorized: base object not addressable "
4200 "for stmt: ");
4201 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4203 if (is_a <bb_vec_info> (vinfo))
4205 /* In BB vectorization the ref can still participate
4206 in dependence analysis, we just can't vectorize it. */
4207 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4208 continue;
4210 return false;
4213 if (is_a <loop_vec_info> (vinfo)
4214 && DR_STEP (dr)
4215 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4217 if (nested_in_vect_loop_p (loop, stmt))
4219 if (dump_enabled_p ())
4221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4222 "not vectorized: not suitable for strided "
4223 "load ");
4224 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4226 return false;
4228 STMT_VINFO_STRIDED_P (stmt_info) = true;
4231 /* Update DR field in stmt_vec_info struct. */
4233 /* If the dataref is in an inner-loop of the loop that is considered
4234 for vectorization, we also want to analyze the access relative to
4235 the outer-loop (DR contains information only relative to the
4236 inner-most enclosing loop). We do that by building a reference to the
4237 first location accessed by the inner-loop, and analyze it relative to
4238 the outer-loop. */
4239 if (loop && nested_in_vect_loop_p (loop, stmt))
4241 /* Build a reference to the first location accessed by the
4242 inner loop: *(BASE + INIT + OFFSET). By construction,
4243 this address must be invariant in the inner loop, so we
4244 can consider it as being used in the outer loop. */
4245 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4246 tree offset = unshare_expr (DR_OFFSET (dr));
4247 tree init = unshare_expr (DR_INIT (dr));
4248 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4249 init, offset);
4250 tree init_addr = fold_build_pointer_plus (base, init_offset);
4251 tree init_ref = build_fold_indirect_ref (init_addr);
4253 if (dump_enabled_p ())
4255 dump_printf_loc (MSG_NOTE, vect_location,
4256 "analyze in outer loop: ");
4257 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4258 dump_printf (MSG_NOTE, "\n");
4261 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4262 init_ref, loop))
4263 /* dr_analyze_innermost already explained the failure. */
4264 return false;
4266 if (dump_enabled_p ())
4268 dump_printf_loc (MSG_NOTE, vect_location,
4269 "\touter base_address: ");
4270 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4271 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4272 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4273 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4274 STMT_VINFO_DR_OFFSET (stmt_info));
4275 dump_printf (MSG_NOTE,
4276 "\n\touter constant offset from base address: ");
4277 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4278 STMT_VINFO_DR_INIT (stmt_info));
4279 dump_printf (MSG_NOTE, "\n\touter step: ");
4280 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4281 STMT_VINFO_DR_STEP (stmt_info));
4282 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4283 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4284 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4285 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4286 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4287 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4288 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4289 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4293 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
4294 STMT_VINFO_DATA_REF (stmt_info) = dr;
4295 if (simd_lane_access)
4297 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4298 free_data_ref (datarefs[i]);
4299 datarefs[i] = dr;
4302 /* Set vectype for STMT. */
4303 scalar_type = TREE_TYPE (DR_REF (dr));
4304 STMT_VINFO_VECTYPE (stmt_info)
4305 = get_vectype_for_scalar_type (scalar_type);
4306 if (!STMT_VINFO_VECTYPE (stmt_info))
4308 if (dump_enabled_p ())
4310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4311 "not vectorized: no vectype for stmt: ");
4312 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4313 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4314 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4315 scalar_type);
4316 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4319 if (is_a <bb_vec_info> (vinfo))
4321 /* No vector type is fine, the ref can still participate
4322 in dependence analysis, we just can't vectorize it. */
4323 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4324 continue;
4327 if (simd_lane_access)
4329 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4330 free_data_ref (dr);
4332 return false;
4334 else
4336 if (dump_enabled_p ())
4338 dump_printf_loc (MSG_NOTE, vect_location,
4339 "got vectype for stmt: ");
4340 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4341 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4342 STMT_VINFO_VECTYPE (stmt_info));
4343 dump_printf (MSG_NOTE, "\n");
4347 /* Adjust the minimal vectorization factor according to the
4348 vector type. */
4349 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4350 *min_vf = upper_bound (*min_vf, vf);
4352 if (gatherscatter != SG_NONE)
4354 gather_scatter_info gs_info;
4355 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4356 &gs_info)
4357 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4359 if (dump_enabled_p ())
4361 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4362 (gatherscatter == GATHER) ?
4363 "not vectorized: not suitable for gather "
4364 "load " :
4365 "not vectorized: not suitable for scatter "
4366 "store ");
4367 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4369 return false;
4371 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4375 /* We used to stop processing and prune the list here. Verify we no
4376 longer need to. */
4377 gcc_assert (i == datarefs.length ());
4379 return true;
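/* Illustrative example of the strided-access handling above: in

     void
     g (int *a, int n, int stride)
     {
       for (int i = 0; i < n; i++)
         a[i * stride] = 0;
     }

   DR_STEP of the store is stride * 4, which is not an INTEGER_CST, so
   STMT_VINFO_STRIDED_P is set and the access is treated as a strided
   store instead of being rejected (unless it occurs in an inner loop
   nested in the loop being vectorized, in which case it is rejected).
   The function name g is used for illustration only.  */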
4383 /* Function vect_get_new_vect_var.
4385 Returns a new variable of type TYPE. The current naming scheme prepends
4386 the prefix "vect", "stmp", "mask" or "vectp" (depending on the value of
4387 VAR_KIND) to NAME if provided, and uses the prefix alone if NAME is
4388 not provided. */
4390 tree
4391 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4393 const char *prefix;
4394 tree new_vect_var;
4396 switch (var_kind)
4398 case vect_simple_var:
4399 prefix = "vect";
4400 break;
4401 case vect_scalar_var:
4402 prefix = "stmp";
4403 break;
4404 case vect_mask_var:
4405 prefix = "mask";
4406 break;
4407 case vect_pointer_var:
4408 prefix = "vectp";
4409 break;
4410 default:
4411 gcc_unreachable ();
4414 if (name)
4416 char* tmp = concat (prefix, "_", name, NULL);
4417 new_vect_var = create_tmp_reg (type, tmp);
4418 free (tmp);
4420 else
4421 new_vect_var = create_tmp_reg (type, prefix);
4423 return new_vect_var;
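/* For example (illustrative only), vect_get_new_vect_var (ptr_type,
   vect_pointer_var, "a") creates a temporary whose dump name is based
   on the prefix "vectp_a".  */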
4426 /* Like vect_get_new_vect_var but return an SSA name. */
4428 tree
4429 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4431 const char *prefix;
4432 tree new_vect_var;
4434 switch (var_kind)
4436 case vect_simple_var:
4437 prefix = "vect";
4438 break;
4439 case vect_scalar_var:
4440 prefix = "stmp";
4441 break;
4442 case vect_pointer_var:
4443 prefix = "vectp";
4444 break;
4445 default:
4446 gcc_unreachable ();
4449 if (name)
4451 char* tmp = concat (prefix, "_", name, NULL);
4452 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4453 free (tmp);
4455 else
4456 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4458 return new_vect_var;
4461 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4463 static void
4464 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4466 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4467 int misalign = DR_MISALIGNMENT (dr);
4468 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4469 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4470 else
4471 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4472 DR_TARGET_ALIGNMENT (dr), misalign);
4475 /* Function vect_create_addr_base_for_vector_ref.
4477 Create an expression that computes the address of the first memory location
4478 that will be accessed for a data reference.
4480 Input:
4481 STMT: The statement containing the data reference.
4482 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4483 OFFSET: Optional. If supplied, it is added to the initial address.
4484 LOOP: Specify relative to which loop-nest should the address be computed.
4485 For example, when the dataref is in an inner-loop nested in an
4486 outer-loop that is now being vectorized, LOOP can be either the
4487 outer-loop, or the inner-loop. The first memory location accessed
4488 by the following dataref ('in' points to short):
4490 for (i=0; i<N; i++)
4491 for (j=0; j<M; j++)
4492 s += in[i+j]
4494 is as follows:
4495 if LOOP=i_loop: &in (relative to i_loop)
4496 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4497 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4498 initial address. Unlike OFFSET, which is a number of elements to
4499 be added, BYTE_OFFSET is measured in bytes.
4501 Output:
4502 1. Return an SSA_NAME whose value is the address of the memory location of
4503 the first vector of the data reference.
4504 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4505 these statement(s) which define the returned SSA_NAME.
4507 FORNOW: We are only handling array accesses with step 1. */
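/* Small numeric illustration of OFFSET versus BYTE_OFFSET for the
   function below, reusing the example above where 'in' points to short:
   with init = 0, OFFSET = 4 yields the base address plus
   4 * sizeof (short) = 8 bytes, i.e. &in[4], whereas BYTE_OFFSET = 4
   yields the base address plus exactly 4 bytes, i.e. &in[2].  */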
4509 tree
4510 vect_create_addr_base_for_vector_ref (gimple *stmt,
4511 gimple_seq *new_stmt_list,
4512 tree offset,
4513 tree byte_offset)
4515 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4516 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4517 const char *base_name;
4518 tree addr_base;
4519 tree dest;
4520 gimple_seq seq = NULL;
4521 tree vect_ptr_type;
4522 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4523 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4524 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4526 tree data_ref_base = unshare_expr (drb->base_address);
4527 tree base_offset = unshare_expr (drb->offset);
4528 tree init = unshare_expr (drb->init);
4530 if (loop_vinfo)
4531 base_name = get_name (data_ref_base);
4532 else
4534 base_offset = ssize_int (0);
4535 init = ssize_int (0);
4536 base_name = get_name (DR_REF (dr));
4539 /* Create base_offset */
4540 base_offset = size_binop (PLUS_EXPR,
4541 fold_convert (sizetype, base_offset),
4542 fold_convert (sizetype, init));
4544 if (offset)
4546 offset = fold_build2 (MULT_EXPR, sizetype,
4547 fold_convert (sizetype, offset), step);
4548 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4549 base_offset, offset);
4551 if (byte_offset)
4553 byte_offset = fold_convert (sizetype, byte_offset);
4554 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4555 base_offset, byte_offset);
4558 /* base + base_offset */
4559 if (loop_vinfo)
4560 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4561 else
4563 addr_base = build1 (ADDR_EXPR,
4564 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4565 unshare_expr (DR_REF (dr)));
4568 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4569 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4570 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4571 gimple_seq_add_seq (new_stmt_list, seq);
4573 if (DR_PTR_INFO (dr)
4574 && TREE_CODE (addr_base) == SSA_NAME
4575 && !SSA_NAME_PTR_INFO (addr_base))
4577 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4578 if (offset || byte_offset)
4579 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4582 if (dump_enabled_p ())
4584 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4585 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4586 dump_printf (MSG_NOTE, "\n");
4589 return addr_base;
4593 /* Function vect_create_data_ref_ptr.
4595 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4596 location accessed in the loop by STMT, along with the def-use update
4597 chain to appropriately advance the pointer through the loop iterations.
4598 Also set aliasing information for the pointer. This pointer is used by
4599 the callers to this function to create a memory reference expression for
4600 vector load/store access.
4602 Input:
4603 1. STMT: a stmt that references memory. Expected to be of the form
4604 GIMPLE_ASSIGN <name, data-ref> or
4605 GIMPLE_ASSIGN <data-ref, name>.
4606 2. AGGR_TYPE: the type of the reference, which should be either a vector
4607 or an array.
4608 3. AT_LOOP: the loop where the vector memref is to be created.
4609 4. OFFSET (optional): an offset to be added to the initial address accessed
4610 by the data-ref in STMT.
4611 5. BSI: location where the new stmts are to be placed if there is no loop
4612 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4613 pointing to the initial address.
4614 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4615 to the initial address accessed by the data-ref in STMT. This is
4616 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4617 in bytes.
4618 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4619 to the IV during each iteration of the loop. NULL says to move
4620 by one copy of AGGR_TYPE up or down, depending on the step of the
4621 data reference.
4623 Output:
4624 1. Declare a new ptr to vector_type, and have it point to the base of the
4625 data reference (the initial address accessed by the data reference).
4626 For example, for a vector of type V8HI, the following code is generated:
4628 v8hi *ap;
4629 ap = (v8hi *)initial_address;
4631 if OFFSET is not supplied:
4632 initial_address = &a[init];
4633 if OFFSET is supplied:
4634 initial_address = &a[init + OFFSET];
4635 if BYTE_OFFSET is supplied:
4636 initial_address = &a[init] + BYTE_OFFSET;
4638 Return the initial_address in INITIAL_ADDRESS.
4640 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4641 update the pointer in each iteration of the loop.
4643 Return the increment stmt that updates the pointer in PTR_INCR.
4645 3. Set INV_P to true if the access pattern of the data reference in the
4646 vectorized loop is invariant. Set it to false otherwise.
4648 4. Return the pointer. */
4650 tree
4651 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4652 tree offset, tree *initial_address,
4653 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4654 bool only_init, bool *inv_p, tree byte_offset,
4655 tree iv_step)
4657 const char *base_name;
4658 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4659 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4660 struct loop *loop = NULL;
4661 bool nested_in_vect_loop = false;
4662 struct loop *containing_loop = NULL;
4663 tree aggr_ptr_type;
4664 tree aggr_ptr;
4665 tree new_temp;
4666 gimple_seq new_stmt_list = NULL;
4667 edge pe = NULL;
4668 basic_block new_bb;
4669 tree aggr_ptr_init;
4670 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4671 tree aptr;
4672 gimple_stmt_iterator incr_gsi;
4673 bool insert_after;
4674 tree indx_before_incr, indx_after_incr;
4675 gimple *incr;
4676 tree step;
4677 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4679 gcc_assert (iv_step != NULL_TREE
4680 || TREE_CODE (aggr_type) == ARRAY_TYPE
4681 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4683 if (loop_vinfo)
4685 loop = LOOP_VINFO_LOOP (loop_vinfo);
4686 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4687 containing_loop = (gimple_bb (stmt))->loop_father;
4688 pe = loop_preheader_edge (loop);
4690 else
4692 gcc_assert (bb_vinfo);
4693 only_init = true;
4694 *ptr_incr = NULL;
4697 /* Check the step (evolution) of the load in LOOP, and record
4698 whether it's invariant. */
4699 step = vect_dr_behavior (dr)->step;
4700 if (integer_zerop (step))
4701 *inv_p = true;
4702 else
4703 *inv_p = false;
4705 /* Create an expression for the first address accessed by this load
4706 in LOOP. */
4707 base_name = get_name (DR_BASE_ADDRESS (dr));
4709 if (dump_enabled_p ())
4711 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4712 dump_printf_loc (MSG_NOTE, vect_location,
4713 "create %s-pointer variable to type: ",
4714 get_tree_code_name (TREE_CODE (aggr_type)));
4715 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4716 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4717 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4718 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4719 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4720 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4721 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4722 else
4723 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4724 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4725 dump_printf (MSG_NOTE, "\n");
4728 /* (1) Create the new aggregate-pointer variable.
4729 Vector and array types inherit the alias set of their component
4730 type by default so we need to use a ref-all pointer if the data
4731 reference does not conflict with the created aggregated data
4732 reference because it is not addressable. */
4733 bool need_ref_all = false;
4734 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4735 get_alias_set (DR_REF (dr))))
4736 need_ref_all = true;
4737 /* Likewise for any of the data references in the stmt group. */
4738 else if (DR_GROUP_SIZE (stmt_info) > 1)
4740 gimple *orig_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info);
4743 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4744 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4745 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4746 get_alias_set (DR_REF (sdr))))
4748 need_ref_all = true;
4749 break;
4751 orig_stmt = DR_GROUP_NEXT_ELEMENT (sinfo);
4753 while (orig_stmt);
4755 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4756 need_ref_all);
4757 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4760 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4761 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4762 def-use update cycles for the pointer: one relative to the outer-loop
4763 (LOOP), which is what steps (3) and (4) below do. The other is relative
4764 to the inner-loop (which is the inner-most loop containing the dataref),
4765 and this is done by step (5) below.
4767 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4768 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4769 redundant. Steps (3),(4) create the following:
4771 vp0 = &base_addr;
4772 LOOP: vp1 = phi(vp0,vp2)
4775 vp2 = vp1 + step
4776 goto LOOP
4778 If there is an inner-loop nested in loop, then step (5) will also be
4779 applied, and an additional update in the inner-loop will be created:
4781 vp0 = &base_addr;
4782 LOOP: vp1 = phi(vp0,vp2)
4784 inner: vp3 = phi(vp1,vp4)
4785 vp4 = vp3 + inner_step
4786 if () goto inner
4788 vp2 = vp1 + step
4789 if () goto LOOP */
4791 /* (2) Calculate the initial address of the aggregate-pointer, and set
4792 the aggregate-pointer to point to it before the loop. */
4794 /* Create &(base[init_val+offset]) + byte_offset in the loop preheader. */
4796 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4797 offset, byte_offset);
4798 if (new_stmt_list)
4800 if (pe)
4802 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4803 gcc_assert (!new_bb);
4805 else
4806 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4809 *initial_address = new_temp;
4810 aggr_ptr_init = new_temp;
4812 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4813 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4814 inner-loop nested in LOOP (during outer-loop vectorization). */
4816 /* No update in loop is required. */
4817 if (only_init && (!loop_vinfo || at_loop == loop))
4818 aptr = aggr_ptr_init;
4819 else
4821 if (iv_step == NULL_TREE)
4823 /* The step of the aggregate pointer is the type size. */
4824 iv_step = TYPE_SIZE_UNIT (aggr_type);
4825 /* One exception to the above is when the scalar step of the load in
4826 LOOP is zero. In this case the step here is also zero. */
4827 if (*inv_p)
4828 iv_step = size_zero_node;
4829 else if (tree_int_cst_sgn (step) == -1)
4830 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4833 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4835 create_iv (aggr_ptr_init,
4836 fold_convert (aggr_ptr_type, iv_step),
4837 aggr_ptr, loop, &incr_gsi, insert_after,
4838 &indx_before_incr, &indx_after_incr);
4839 incr = gsi_stmt (incr_gsi);
4840 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4842 /* Copy the points-to information if it exists. */
4843 if (DR_PTR_INFO (dr))
4845 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4846 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4848 if (ptr_incr)
4849 *ptr_incr = incr;
4851 aptr = indx_before_incr;
4854 if (!nested_in_vect_loop || only_init)
4855 return aptr;
4858 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4859 nested in LOOP, if it exists. */
4861 gcc_assert (nested_in_vect_loop);
4862 if (!only_init)
4864 standard_iv_increment_position (containing_loop, &incr_gsi,
4865 &insert_after);
4866 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4867 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4868 &indx_after_incr);
4869 incr = gsi_stmt (incr_gsi);
4870 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4872 /* Copy the points-to information if it exists. */
4873 if (DR_PTR_INFO (dr))
4875 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4876 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4878 if (ptr_incr)
4879 *ptr_incr = incr;
4881 return indx_before_incr;
4883 else
4884 gcc_unreachable ();
4888 /* Function bump_vector_ptr
4890 Increment a pointer (to a vector type) by vector-size. If requested,
4891 i.e. if PTR-INCR is given, then also connect the new increment stmt
4892 to the existing def-use update-chain of the pointer, by modifying
4893 the PTR_INCR as illustrated below:
4895 The pointer def-use update-chain before this function:
4896 DATAREF_PTR = phi (p_0, p_2)
4897 ....
4898 PTR_INCR: p_2 = DATAREF_PTR + step
4900 The pointer def-use update-chain after this function:
4901 DATAREF_PTR = phi (p_0, p_2)
4902 ....
4903 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4904 ....
4905 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4907 Input:
4908 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4909 in the loop.
4910 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4911 the loop. The increment amount across iterations is expected
4912 to be vector_size.
4913 BSI - location where the new update stmt is to be placed.
4914 STMT - the original scalar memory-access stmt that is being vectorized.
4915 BUMP - optional. The offset by which to bump the pointer. If not given,
4916 the offset is assumed to be vector_size.
4918 Output: Return NEW_DATAREF_PTR as illustrated above.
4922 tree
4923 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4924 gimple *stmt, tree bump)
4926 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4927 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4928 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4929 tree update = TYPE_SIZE_UNIT (vectype);
4930 gassign *incr_stmt;
4931 ssa_op_iter iter;
4932 use_operand_p use_p;
4933 tree new_dataref_ptr;
4935 if (bump)
4936 update = bump;
4938 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4939 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4940 else
4941 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4942 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4943 dataref_ptr, update);
4944 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4946 /* Copy the points-to information if it exists. */
4947 if (DR_PTR_INFO (dr))
4949 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4950 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4953 if (!ptr_incr)
4954 return new_dataref_ptr;
4956 /* Update the vector-pointer's cross-iteration increment. */
4957 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4959 tree use = USE_FROM_PTR (use_p);
4961 if (use == dataref_ptr)
4962 SET_USE (use_p, new_dataref_ptr);
4963 else
4964 gcc_assert (operand_equal_p (use, update, 0));
4967 return new_dataref_ptr;
4971 /* Copy memory reference info such as base/clique from the SRC reference
4972 to the DEST MEM_REF. */
4974 void
4975 vect_copy_ref_info (tree dest, tree src)
4977 if (TREE_CODE (dest) != MEM_REF)
4978 return;
4980 tree src_base = src;
4981 while (handled_component_p (src_base))
4982 src_base = TREE_OPERAND (src_base, 0);
4983 if (TREE_CODE (src_base) != MEM_REF
4984 && TREE_CODE (src_base) != TARGET_MEM_REF)
4985 return;
4987 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
4988 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
4992 /* Function vect_create_destination_var.
4994 Create a new temporary of type VECTYPE. */
4996 tree
4997 vect_create_destination_var (tree scalar_dest, tree vectype)
4999 tree vec_dest;
5000 const char *name;
5001 char *new_name;
5002 tree type;
5003 enum vect_var_kind kind;
5005 kind = vectype
5006 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5007 ? vect_mask_var
5008 : vect_simple_var
5009 : vect_scalar_var;
5010 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5012 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5014 name = get_name (scalar_dest);
5015 if (name)
5016 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5017 else
5018 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5019 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5020 free (new_name);
5022 return vec_dest;
5025 /* Function vect_grouped_store_supported.
5027 Returns TRUE if interleave high and interleave low permutations
5028 are supported, and FALSE otherwise. */
5030 bool
5031 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5033 machine_mode mode = TYPE_MODE (vectype);
5035 /* vect_permute_store_chain requires the group size to be equal to 3 or
5036 be a power of two. */
5037 if (count != 3 && exact_log2 (count) == -1)
5039 if (dump_enabled_p ())
5040 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5041 "the size of the group of accesses"
5042 " is not a power of 2 or not equal to 3\n");
5043 return false;
5046 /* Check that the permutation is supported. */
5047 if (VECTOR_MODE_P (mode))
5049 unsigned int i;
5050 if (count == 3)
5052 unsigned int j0 = 0, j1 = 0, j2 = 0;
5053 unsigned int i, j;
5055 unsigned int nelt;
5056 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5058 if (dump_enabled_p ())
5059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5060 "cannot handle groups of 3 stores for"
5061 " variable-length vectors\n");
5062 return false;
5065 vec_perm_builder sel (nelt, nelt, 1);
5066 sel.quick_grow (nelt);
5067 vec_perm_indices indices;
5068 for (j = 0; j < 3; j++)
5070 int nelt0 = ((3 - j) * nelt) % 3;
5071 int nelt1 = ((3 - j) * nelt + 1) % 3;
5072 int nelt2 = ((3 - j) * nelt + 2) % 3;
5073 for (i = 0; i < nelt; i++)
5075 if (3 * i + nelt0 < nelt)
5076 sel[3 * i + nelt0] = j0++;
5077 if (3 * i + nelt1 < nelt)
5078 sel[3 * i + nelt1] = nelt + j1++;
5079 if (3 * i + nelt2 < nelt)
5080 sel[3 * i + nelt2] = 0;
5082 indices.new_vector (sel, 2, nelt);
5083 if (!can_vec_perm_const_p (mode, indices))
5085 if (dump_enabled_p ())
5086 dump_printf (MSG_MISSED_OPTIMIZATION,
5087 "permutation op not supported by target.\n");
5088 return false;
5091 for (i = 0; i < nelt; i++)
5093 if (3 * i + nelt0 < nelt)
5094 sel[3 * i + nelt0] = 3 * i + nelt0;
5095 if (3 * i + nelt1 < nelt)
5096 sel[3 * i + nelt1] = 3 * i + nelt1;
5097 if (3 * i + nelt2 < nelt)
5098 sel[3 * i + nelt2] = nelt + j2++;
5100 indices.new_vector (sel, 2, nelt);
5101 if (!can_vec_perm_const_p (mode, indices))
5103 if (dump_enabled_p ())
5104 dump_printf (MSG_MISSED_OPTIMIZATION,
5105 "permutation op not supported by target.\n");
5106 return false;
5109 return true;
5111 else
5113 /* If length is not equal to 3 then only power of 2 is supported. */
5114 gcc_assert (pow2p_hwi (count));
5115 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5117 /* The encoding has 2 interleaved stepped patterns. */
5118 vec_perm_builder sel (nelt, 2, 3);
5119 sel.quick_grow (6);
5120 for (i = 0; i < 3; i++)
5122 sel[i * 2] = i;
5123 sel[i * 2 + 1] = i + nelt;
5125 vec_perm_indices indices (sel, 2, nelt);
5126 if (can_vec_perm_const_p (mode, indices))
5128 for (i = 0; i < 6; i++)
5129 sel[i] += exact_div (nelt, 2);
5130 indices.new_vector (sel, 2, nelt);
5131 if (can_vec_perm_const_p (mode, indices))
5132 return true;
5137 if (dump_enabled_p ())
5138 dump_printf (MSG_MISSED_OPTIMIZATION,
5139 "permutation op not supported by target.\n");
5140 return false;
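/* Concrete instance of the power-of-two selectors tested above
   (illustrative only): for 8-element vectors the first selector expands
   to { 0, 8, 1, 9, 2, 10, 3, 11 } and, after adding nelt/2 = 4 to each
   element, the second expands to { 4, 12, 5, 13, 6, 14, 7, 15 }.
   Together they interleave two input vectors into the "high" and "low"
   results used by vect_permute_store_chain.  */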
5144 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5145 type VECTYPE. MASKED_P says whether the masked form is needed. */
5147 bool
5148 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5149 bool masked_p)
5151 if (masked_p)
5152 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5153 vec_mask_store_lanes_optab,
5154 vectype, count);
5155 else
5156 return vect_lanes_optab_supported_p ("vec_store_lanes",
5157 vec_store_lanes_optab,
5158 vectype, count);
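/* Usage note (illustrative only): when vect_store_lanes_supported
   returns true for, say, a group of four interleaved stores

     for (i = 0; i < n; i++)
       {
         out[4 * i + 0] = a[i];
         out[4 * i + 1] = b[i];
         out[4 * i + 2] = c[i];
         out[4 * i + 3] = d[i];
       }

   the vectorizer can emit a single store-lanes operation instead of the
   explicit permutations produced by vect_permute_store_chain below; on
   targets that provide the optab (for example AArch64 via its st4-style
   instructions) this is usually the cheaper choice.  */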
5162 /* Function vect_permute_store_chain.
5164 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5165 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5166 the data correctly for the stores. Return the final references for stores
5167 in RESULT_CHAIN.
5169 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5170 The input is 4 vectors each containing 8 elements. We assign a number to
5171 each element, the input sequence is:
5173 1st vec: 0 1 2 3 4 5 6 7
5174 2nd vec: 8 9 10 11 12 13 14 15
5175 3rd vec: 16 17 18 19 20 21 22 23
5176 4th vec: 24 25 26 27 28 29 30 31
5178 The output sequence should be:
5180 1st vec: 0 8 16 24 1 9 17 25
5181 2nd vec: 2 10 18 26 3 11 19 27
5182 3rd vec: 4 12 20 28 5 13 21 29
5183 4th vec: 6 14 22 30 7 15 23 31
5185 i.e., we interleave the contents of the four vectors in their order.
5187 We use interleave_high/low instructions to create such output. The input of
5188 each interleave_high/low operation is two vectors:
5189 1st vec 2nd vec
5190 0 1 2 3 4 5 6 7
5191 the even elements of the result vector are obtained left-to-right from the
5192 high/low elements of the first vector. The odd elements of the result are
5193 obtained left-to-right from the high/low elements of the second vector.
5194 The output of interleave_high will be: 0 4 1 5
5195 and of interleave_low: 2 6 3 7
5198 The permutation is done in log LENGTH stages. In each stage interleave_high
5199 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5200 where the first argument is taken from the first half of DR_CHAIN and the
5201 second argument from its second half.
5202 In our example,
5204 I1: interleave_high (1st vec, 3rd vec)
5205 I2: interleave_low (1st vec, 3rd vec)
5206 I3: interleave_high (2nd vec, 4th vec)
5207 I4: interleave_low (2nd vec, 4th vec)
5209 The output for the first stage is:
5211 I1: 0 16 1 17 2 18 3 19
5212 I2: 4 20 5 21 6 22 7 23
5213 I3: 8 24 9 25 10 26 11 27
5214 I4: 12 28 13 29 14 30 15 31
5216 The output of the second stage, i.e. the final result is:
5218 I1: 0 8 16 24 1 9 17 25
5219 I2: 2 10 18 26 3 11 19 27
5220 I3: 4 12 20 28 5 13 21 29
5221 I4: 6 14 22 30 7 15 23 31. */
5223 void
5224 vect_permute_store_chain (vec<tree> dr_chain,
5225 unsigned int length,
5226 gimple *stmt,
5227 gimple_stmt_iterator *gsi,
5228 vec<tree> *result_chain)
5230 tree vect1, vect2, high, low;
5231 gimple *perm_stmt;
5232 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5233 tree perm_mask_low, perm_mask_high;
5234 tree data_ref;
5235 tree perm3_mask_low, perm3_mask_high;
5236 unsigned int i, j, n, log_length = exact_log2 (length);
5238 result_chain->quick_grow (length);
5239 memcpy (result_chain->address (), dr_chain.address (),
5240 length * sizeof (tree));
5242 if (length == 3)
5244 /* vect_grouped_store_supported ensures that this is constant. */
5245 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5246 unsigned int j0 = 0, j1 = 0, j2 = 0;
5248 vec_perm_builder sel (nelt, nelt, 1);
5249 sel.quick_grow (nelt);
5250 vec_perm_indices indices;
5251 for (j = 0; j < 3; j++)
5253 int nelt0 = ((3 - j) * nelt) % 3;
5254 int nelt1 = ((3 - j) * nelt + 1) % 3;
5255 int nelt2 = ((3 - j) * nelt + 2) % 3;
5257 for (i = 0; i < nelt; i++)
5259 if (3 * i + nelt0 < nelt)
5260 sel[3 * i + nelt0] = j0++;
5261 if (3 * i + nelt1 < nelt)
5262 sel[3 * i + nelt1] = nelt + j1++;
5263 if (3 * i + nelt2 < nelt)
5264 sel[3 * i + nelt2] = 0;
5266 indices.new_vector (sel, 2, nelt);
5267 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5269 for (i = 0; i < nelt; i++)
5271 if (3 * i + nelt0 < nelt)
5272 sel[3 * i + nelt0] = 3 * i + nelt0;
5273 if (3 * i + nelt1 < nelt)
5274 sel[3 * i + nelt1] = 3 * i + nelt1;
5275 if (3 * i + nelt2 < nelt)
5276 sel[3 * i + nelt2] = nelt + j2++;
5278 indices.new_vector (sel, 2, nelt);
5279 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5281 vect1 = dr_chain[0];
5282 vect2 = dr_chain[1];
5284 /* Create interleaving stmt:
5285 low = VEC_PERM_EXPR <vect1, vect2,
5286 {j, nelt, *, j + 1, nelt + j + 1, *,
5287 j + 2, nelt + j + 2, *, ...}> */
5288 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5289 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5290 vect2, perm3_mask_low);
5291 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5293 vect1 = data_ref;
5294 vect2 = dr_chain[2];
5295 /* Create interleaving stmt:
5296 low = VEC_PERM_EXPR <vect1, vect2,
5297 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5298 6, 7, nelt + j + 2, ...}> */
5299 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5300 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5301 vect2, perm3_mask_high);
5302 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5303 (*result_chain)[j] = data_ref;
5306 else
5308 /* If length is not equal to 3 then only power of 2 is supported. */
5309 gcc_assert (pow2p_hwi (length));
5311 /* The encoding has 2 interleaved stepped patterns. */
5312 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5313 vec_perm_builder sel (nelt, 2, 3);
5314 sel.quick_grow (6);
5315 for (i = 0; i < 3; i++)
5317 sel[i * 2] = i;
5318 sel[i * 2 + 1] = i + nelt;
5320 vec_perm_indices indices (sel, 2, nelt);
5321 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5323 for (i = 0; i < 6; i++)
5324 sel[i] += exact_div (nelt, 2);
5325 indices.new_vector (sel, 2, nelt);
5326 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5328 for (i = 0, n = log_length; i < n; i++)
5330 for (j = 0; j < length/2; j++)
5332 vect1 = dr_chain[j];
5333 vect2 = dr_chain[j+length/2];
5335 /* Create interleaving stmt:
5336 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5337 ...}> */
5338 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5339 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5340 vect2, perm_mask_high);
5341 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5342 (*result_chain)[2*j] = high;
5344 /* Create interleaving stmt:
5345 low = VEC_PERM_EXPR <vect1, vect2,
5346 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5347 ...}> */
5348 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5349 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5350 vect2, perm_mask_low);
5351 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5352 (*result_chain)[2*j+1] = low;
5354 memcpy (dr_chain.address (), result_chain->address (),
5355 length * sizeof (tree));
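/* To make the power-of-two branch above concrete, here is a minimal
   standalone sketch (not vectorizer code; the function name and the plain
   int arrays are illustrative assumptions) of one interleave stage: HIGH
   interleaves the first halves of the two inputs and LOW their second
   halves, which is exactly what the {0, nelt, 1, nelt+1, ...} and
   {nelt/2, nelt*3/2, nelt/2+1, ...} selectors request.  Applying it to the
   pairs (1st,3rd) and (2nd,4th) of the 4x8 example reproduces I1..I4 of
   the first stage in the comment.  */

static void
sketch_interleave_store_stage (const int *v1, const int *v2, int nelt,
                               int *high, int *low)
{
  for (int i = 0; i < nelt / 2; i++)
    {
      /* HIGH takes element pairs from the first halves of V1 and V2.  */
      high[2 * i] = v1[i];
      high[2 * i + 1] = v2[i];
      /* LOW takes the corresponding pairs from the second halves.  */
      low[2 * i] = v1[nelt / 2 + i];
      low[2 * i + 1] = v2[nelt / 2 + i];
    }
}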
5360 /* Function vect_setup_realignment
5362 This function is called when vectorizing an unaligned load using
5363 the dr_explicit_realign[_optimized] scheme.
5364 This function generates the following code at the loop prolog:
5366 p = initial_addr;
5367 x msq_init = *(floor(p)); # prolog load
5368 realignment_token = call target_builtin;
5369 loop:
5370 x msq = phi (msq_init, ---)
5372 The stmts marked with x are generated only for the case of
5373 dr_explicit_realign_optimized.
5375 The code above sets up a new (vector) pointer, pointing to the first
5376 location accessed by STMT, and a "floor-aligned" load using that pointer.
5377 It also generates code to compute the "realignment-token" (if the relevant
5378 target hook was defined), and creates a phi-node at the loop-header bb
5379 whose arguments are the result of the prolog-load (created by this
5380 function) and the result of a load that takes place in the loop (to be
5381 created by the caller to this function).
5383 For the case of dr_explicit_realign_optimized:
5384 The caller to this function uses the phi-result (msq) to create the
5385 realignment code inside the loop, and sets up the missing phi argument,
5386 as follows:
5387 loop:
5388 msq = phi (msq_init, lsq)
5389 lsq = *(floor(p')); # load in loop
5390 result = realign_load (msq, lsq, realignment_token);
5392 For the case of dr_explicit_realign:
5393 loop:
5394 msq = *(floor(p)); # load in loop
5395 p' = p + (VS-1);
5396 lsq = *(floor(p')); # load in loop
5397 result = realign_load (msq, lsq, realignment_token);
5399 Input:
5400 STMT - (scalar) load stmt to be vectorized. This load accesses
5401 a memory location that may be unaligned.
5402 GSI - place where new code is to be inserted.
5403 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5404 is used.
5406 Output:
5407 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5408 target hook, if defined.
5409 Return value - the result of the loop-header phi node. */
5411 tree
5412 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5413 tree *realignment_token,
5414 enum dr_alignment_support alignment_support_scheme,
5415 tree init_addr,
5416 struct loop **at_loop)
5418 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5419 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5420 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5421 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5422 struct loop *loop = NULL;
5423 edge pe = NULL;
5424 tree scalar_dest = gimple_assign_lhs (stmt);
5425 tree vec_dest;
5426 gimple *inc;
5427 tree ptr;
5428 tree data_ref;
5429 basic_block new_bb;
5430 tree msq_init = NULL_TREE;
5431 tree new_temp;
5432 gphi *phi_stmt;
5433 tree msq = NULL_TREE;
5434 gimple_seq stmts = NULL;
5435 bool inv_p;
5436 bool compute_in_loop = false;
5437 bool nested_in_vect_loop = false;
5438 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5439 struct loop *loop_for_initial_load = NULL;
5441 if (loop_vinfo)
5443 loop = LOOP_VINFO_LOOP (loop_vinfo);
5444 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5447 gcc_assert (alignment_support_scheme == dr_explicit_realign
5448 || alignment_support_scheme == dr_explicit_realign_optimized);
5450 /* We need to generate three things:
5451 1. the misalignment computation
5452 2. the extra vector load (for the optimized realignment scheme).
5453 3. the phi node for the two vectors from which the realignment is
5454 done (for the optimized realignment scheme). */
5456 /* 1. Determine where to generate the misalignment computation.
5458 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5459 calculation will be generated by this function, outside the loop (in the
5460 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5461 caller, inside the loop.
5463 Background: If the misalignment remains fixed throughout the iterations of
5464 the loop, then both realignment schemes are applicable, and also the
5465 misalignment computation can be done outside LOOP. This is because we are
5466 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5467 are a multiple of VS (the Vector Size), and therefore the misalignment in
5468 different vectorized LOOP iterations is always the same.
5469 The problem arises only if the memory access is in an inner-loop nested
5470 inside LOOP, which is now being vectorized using outer-loop vectorization.
5471 This is the only case when the misalignment of the memory access may not
5472 remain fixed throughout the iterations of the inner-loop (as explained in
5473 detail in vect_supportable_dr_alignment). In this case, not only is the
5474 optimized realignment scheme not applicable, but also the misalignment
5475 computation (and generation of the realignment token that is passed to
5476 REALIGN_LOAD) have to be done inside the loop.
5478 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5479 or not, which in turn determines if the misalignment is computed inside
5480 the inner-loop, or outside LOOP. */
5482 if (init_addr != NULL_TREE || !loop_vinfo)
5484 compute_in_loop = true;
5485 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5489 /* 2. Determine where to generate the extra vector load.
5491 For the optimized realignment scheme, instead of generating two vector
5492 loads in each iteration, we generate a single extra vector load in the
5493 preheader of the loop, and in each iteration reuse the result of the
5494 vector load from the previous iteration. In case the memory access is in
5495 an inner-loop nested inside LOOP, which is now being vectorized using
5496 outer-loop vectorization, we need to determine whether this initial vector
5497 load should be generated at the preheader of the inner-loop, or can be
5498 generated at the preheader of LOOP. If the memory access has no evolution
5499 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5500 to be generated inside LOOP (in the preheader of the inner-loop). */
5502 if (nested_in_vect_loop)
5504 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5505 bool invariant_in_outerloop =
5506 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5507 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5509 else
5510 loop_for_initial_load = loop;
5511 if (at_loop)
5512 *at_loop = loop_for_initial_load;
5514 if (loop_for_initial_load)
5515 pe = loop_preheader_edge (loop_for_initial_load);
5517 /* 3. For the case of the optimized realignment, create the first vector
5518 load at the loop preheader. */
5520 if (alignment_support_scheme == dr_explicit_realign_optimized)
5522 /* Create msq_init = *(floor(p1)) in the loop preheader */
5523 gassign *new_stmt;
5525 gcc_assert (!compute_in_loop);
5526 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5527 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5528 NULL_TREE, &init_addr, NULL, &inc,
5529 true, &inv_p);
5530 if (TREE_CODE (ptr) == SSA_NAME)
5531 new_temp = copy_ssa_name (ptr);
5532 else
5533 new_temp = make_ssa_name (TREE_TYPE (ptr));
5534 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5535 new_stmt = gimple_build_assign
5536 (new_temp, BIT_AND_EXPR, ptr,
5537 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5538 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5539 gcc_assert (!new_bb);
5540 data_ref
5541 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5542 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5543 vect_copy_ref_info (data_ref, DR_REF (dr));
5544 new_stmt = gimple_build_assign (vec_dest, data_ref);
5545 new_temp = make_ssa_name (vec_dest, new_stmt);
5546 gimple_assign_set_lhs (new_stmt, new_temp);
5547 if (pe)
5549 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5550 gcc_assert (!new_bb);
5552 else
5553 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5555 msq_init = gimple_assign_lhs (new_stmt);
5558 /* 4. Create realignment token using a target builtin, if available.
5559 It is done either inside the containing loop, or before LOOP (as
5560 determined above). */
5562 if (targetm.vectorize.builtin_mask_for_load)
5564 gcall *new_stmt;
5565 tree builtin_decl;
5567 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5568 if (!init_addr)
5570 /* Generate the INIT_ADDR computation outside LOOP. */
5571 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5572 NULL_TREE);
5573 if (loop)
5575 pe = loop_preheader_edge (loop);
5576 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5577 gcc_assert (!new_bb);
5579 else
5580 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5583 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5584 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5585 vec_dest =
5586 vect_create_destination_var (scalar_dest,
5587 gimple_call_return_type (new_stmt));
5588 new_temp = make_ssa_name (vec_dest, new_stmt);
5589 gimple_call_set_lhs (new_stmt, new_temp);
5591 if (compute_in_loop)
5592 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5593 else
5595 /* Generate the misalignment computation outside LOOP. */
5596 pe = loop_preheader_edge (loop);
5597 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5598 gcc_assert (!new_bb);
5601 *realignment_token = gimple_call_lhs (new_stmt);
5603 /* The result of the CALL_EXPR to this builtin is determined from
5604 the value of the parameter and no global variables are touched
5605 which makes the builtin a "const" function. Requiring the
5606 builtin to have the "const" attribute makes it unnecessary
5607 to call mark_call_clobbered. */
5608 gcc_assert (TREE_READONLY (builtin_decl));
5611 if (alignment_support_scheme == dr_explicit_realign)
5612 return msq;
5614 gcc_assert (!compute_in_loop);
5615 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5618 /* 5. Create msq = phi <msq_init, lsq> in loop */
5620 pe = loop_preheader_edge (containing_loop);
5621 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5622 msq = make_ssa_name (vec_dest);
5623 phi_stmt = create_phi_node (msq, containing_loop->header);
5624 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5626 return msq;
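/* A minimal standalone sketch (not GCC code; the function name is an
   illustrative assumption) of the "floor (p)" computation that the prolog
   load above performs via the BIT_AND_EXPR with -(HOST_WIDE_INT) align:
   the address is rounded down to the target alignment by clearing its low
   bits.  ALIGN must be a power of two, which DR_TARGET_ALIGNMENT
   guarantees.  */

static unsigned long long
sketch_floor_align_address (unsigned long long addr, unsigned long long align)
{
  /* -ALIGN has all bits set above log2 (ALIGN), so the AND clears the
     misaligned low bits and keeps the rest of the address.  */
  return addr & -align;
}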
5630 /* Function vect_grouped_load_supported.
5632 COUNT is the size of the load group (the number of statements plus the
5633 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5634 only one statement, with a gap of COUNT - 1.
5636 Returns true if a suitable permute exists. */
5638 bool
5639 vect_grouped_load_supported (tree vectype, bool single_element_p,
5640 unsigned HOST_WIDE_INT count)
5642 machine_mode mode = TYPE_MODE (vectype);
5644 /* If this is single-element interleaving with an element distance
5645 that leaves unused vector loads around, punt - we would at least create
5646 very sub-optimal code in that case (and blow up memory,
5647 see PR65518). */
5648 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5650 if (dump_enabled_p ())
5651 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5652 "single-element interleaving not supported "
5653 "for not adjacent vector loads\n");
5654 return false;
5657 /* vect_permute_load_chain requires the group size to be equal to 3 or
5658 be a power of two. */
5659 if (count != 3 && exact_log2 (count) == -1)
5661 if (dump_enabled_p ())
5662 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5663 "the size of the group of accesses"
5664 " is not a power of 2 or not equal to 3\n");
5665 return false;
5668 /* Check that the permutation is supported. */
5669 if (VECTOR_MODE_P (mode))
5671 unsigned int i, j;
5672 if (count == 3)
5674 unsigned int nelt;
5675 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5677 if (dump_enabled_p ())
5678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5679 "cannot handle groups of 3 loads for"
5680 " variable-length vectors\n");
5681 return false;
5684 vec_perm_builder sel (nelt, nelt, 1);
5685 sel.quick_grow (nelt);
5686 vec_perm_indices indices;
5687 unsigned int k;
5688 for (k = 0; k < 3; k++)
5690 for (i = 0; i < nelt; i++)
5691 if (3 * i + k < 2 * nelt)
5692 sel[i] = 3 * i + k;
5693 else
5694 sel[i] = 0;
5695 indices.new_vector (sel, 2, nelt);
5696 if (!can_vec_perm_const_p (mode, indices))
5698 if (dump_enabled_p ())
5699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5700 "shuffle of 3 loads is not supported by"
5701 " target\n");
5702 return false;
5704 for (i = 0, j = 0; i < nelt; i++)
5705 if (3 * i + k < 2 * nelt)
5706 sel[i] = i;
5707 else
5708 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5709 indices.new_vector (sel, 2, nelt);
5710 if (!can_vec_perm_const_p (mode, indices))
5712 if (dump_enabled_p ())
5713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5714 "shuffle of 3 loads is not supported by"
5715 " target\n");
5716 return false;
5719 return true;
5721 else
5723 /* If the count is not equal to 3, then only a power of 2 is supported. */
5724 gcc_assert (pow2p_hwi (count));
5725 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5727 /* The encoding has a single stepped pattern. */
5728 vec_perm_builder sel (nelt, 1, 3);
5729 sel.quick_grow (3);
5730 for (i = 0; i < 3; i++)
5731 sel[i] = i * 2;
5732 vec_perm_indices indices (sel, 2, nelt);
5733 if (can_vec_perm_const_p (mode, indices))
5735 for (i = 0; i < 3; i++)
5736 sel[i] = i * 2 + 1;
5737 indices.new_vector (sel, 2, nelt);
5738 if (can_vec_perm_const_p (mode, indices))
5739 return true;
5744 if (dump_enabled_p ())
5745 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5746 "extract even/odd not supported by target\n");
5747 return false;
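/* A standalone sketch (not part of GCC; the function name is illustrative
   only) of the first selector tried in the COUNT == 3 branch above, for
   one value of K: output slot I reads element 3*I + K of the two
   concatenated input vectors as long as that index is still in range; the
   leftover slots are don't-care and set to 0.  For NELT == 8 and K == 0
   this yields { 0, 3, 6, 9, 12, 15, 0, 0 }.  */

static void
sketch_load3_selector (unsigned int nelt, unsigned int k, unsigned int *sel)
{
  for (unsigned int i = 0; i < nelt; i++)
    sel[i] = (3 * i + k < 2 * nelt) ? 3 * i + k : 0;
}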
5750 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5751 type VECTYPE. MASKED_P says whether the masked form is needed. */
5753 bool
5754 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5755 bool masked_p)
5757 if (masked_p)
5758 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5759 vec_mask_load_lanes_optab,
5760 vectype, count);
5761 else
5762 return vect_lanes_optab_supported_p ("vec_load_lanes",
5763 vec_load_lanes_optab,
5764 vectype, count);
5767 /* Function vect_permute_load_chain.
5769 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5770 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5771 the input data correctly. Return the final references for loads in
5772 RESULT_CHAIN.
5774 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5775 The input is 4 vectors each containing 8 elements. We assign a number to each
5776 element, the input sequence is:
5778 1st vec: 0 1 2 3 4 5 6 7
5779 2nd vec: 8 9 10 11 12 13 14 15
5780 3rd vec: 16 17 18 19 20 21 22 23
5781 4th vec: 24 25 26 27 28 29 30 31
5783 The output sequence should be:
5785 1st vec: 0 4 8 12 16 20 24 28
5786 2nd vec: 1 5 9 13 17 21 25 29
5787 3rd vec: 2 6 10 14 18 22 26 30
5788 4th vec: 3 7 11 15 19 23 27 31
5790 i.e., the first output vector should contain the first elements of each
5791 interleaving group, etc.
5793 We use extract_even/odd instructions to create such output. The input of
5794 each extract_even/odd operation is two vectors
5795 1st vec 2nd vec
5796 0 1 2 3 4 5 6 7
5798 and the output is the vector of extracted even/odd elements. The output of
5799 extract_even will be: 0 2 4 6
5800 and of extract_odd: 1 3 5 7
5803 The permutation is done in log LENGTH stages. In each stage extract_even
5804 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5805 their order. In our example,
5807 E1: extract_even (1st vec, 2nd vec)
5808 E2: extract_odd (1st vec, 2nd vec)
5809 E3: extract_even (3rd vec, 4th vec)
5810 E4: extract_odd (3rd vec, 4th vec)
5812 The output for the first stage will be:
5814 E1: 0 2 4 6 8 10 12 14
5815 E2: 1 3 5 7 9 11 13 15
5816 E3: 16 18 20 22 24 26 28 30
5817 E4: 17 19 21 23 25 27 29 31
5819 In order to proceed and create the correct sequence for the next stage (or
5820 for the correct output, if the second stage is the last one, as in our
5821 example), we first put the output of extract_even operation and then the
5822 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5823 The input for the second stage is:
5825 1st vec (E1): 0 2 4 6 8 10 12 14
5826 2nd vec (E3): 16 18 20 22 24 26 28 30
5827 3rd vec (E2): 1 3 5 7 9 11 13 15
5828 4th vec (E4): 17 19 21 23 25 27 29 31
5830 The output of the second stage:
5832 E1: 0 4 8 12 16 20 24 28
5833 E2: 2 6 10 14 18 22 26 30
5834 E3: 1 5 9 13 17 21 25 29
5835 E4: 3 7 11 15 19 23 27 31
5837 And RESULT_CHAIN after reordering:
5839 1st vec (E1): 0 4 8 12 16 20 24 28
5840 2nd vec (E3): 1 5 9 13 17 21 25 29
5841 3rd vec (E2): 2 6 10 14 18 22 26 30
5842 4th vec (E4): 3 7 11 15 19 23 27 31. */
5844 static void
5845 vect_permute_load_chain (vec<tree> dr_chain,
5846 unsigned int length,
5847 gimple *stmt,
5848 gimple_stmt_iterator *gsi,
5849 vec<tree> *result_chain)
5851 tree data_ref, first_vect, second_vect;
5852 tree perm_mask_even, perm_mask_odd;
5853 tree perm3_mask_low, perm3_mask_high;
5854 gimple *perm_stmt;
5855 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5856 unsigned int i, j, log_length = exact_log2 (length);
5858 result_chain->quick_grow (length);
5859 memcpy (result_chain->address (), dr_chain.address (),
5860 length * sizeof (tree));
5862 if (length == 3)
5864 /* vect_grouped_load_supported ensures that this is constant. */
5865 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5866 unsigned int k;
5868 vec_perm_builder sel (nelt, nelt, 1);
5869 sel.quick_grow (nelt);
5870 vec_perm_indices indices;
5871 for (k = 0; k < 3; k++)
5873 for (i = 0; i < nelt; i++)
5874 if (3 * i + k < 2 * nelt)
5875 sel[i] = 3 * i + k;
5876 else
5877 sel[i] = 0;
5878 indices.new_vector (sel, 2, nelt);
5879 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5881 for (i = 0, j = 0; i < nelt; i++)
5882 if (3 * i + k < 2 * nelt)
5883 sel[i] = i;
5884 else
5885 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5886 indices.new_vector (sel, 2, nelt);
5887 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5889 first_vect = dr_chain[0];
5890 second_vect = dr_chain[1];
5892 /* Create interleaving stmt (low part of):
5893 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5894 ...}> */
5895 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5896 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5897 second_vect, perm3_mask_low);
5898 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5900 /* Create interleaving stmt (high part of):
5901 high = VEC_PERM_EXPR <first_vect, second_vect, {0, 1, 2, ...,
5902 nelt + ((nelt + k) % 3), nelt + ((nelt + k) % 3) + 3, ...}> */
5903 first_vect = data_ref;
5904 second_vect = dr_chain[2];
5905 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5906 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5907 second_vect, perm3_mask_high);
5908 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5909 (*result_chain)[k] = data_ref;
5912 else
5914 /* If the length is not equal to 3, then only a power of 2 is supported. */
5915 gcc_assert (pow2p_hwi (length));
5917 /* The encoding has a single stepped pattern. */
5918 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5919 vec_perm_builder sel (nelt, 1, 3);
5920 sel.quick_grow (3);
5921 for (i = 0; i < 3; ++i)
5922 sel[i] = i * 2;
5923 vec_perm_indices indices (sel, 2, nelt);
5924 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5926 for (i = 0; i < 3; ++i)
5927 sel[i] = i * 2 + 1;
5928 indices.new_vector (sel, 2, nelt);
5929 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5931 for (i = 0; i < log_length; i++)
5933 for (j = 0; j < length; j += 2)
5935 first_vect = dr_chain[j];
5936 second_vect = dr_chain[j+1];
5938 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5939 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5940 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5941 first_vect, second_vect,
5942 perm_mask_even);
5943 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5944 (*result_chain)[j/2] = data_ref;
5946 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5947 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5948 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5949 first_vect, second_vect,
5950 perm_mask_odd);
5951 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5952 (*result_chain)[j/2+length/2] = data_ref;
5954 memcpy (dr_chain.address (), result_chain->address (),
5955 length * sizeof (tree));
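/* A minimal sketch (not vectorizer code; the name and the plain int arrays
   are assumptions for illustration) of one extract_even/extract_odd stage
   from the power-of-two branch above, operating on the concatenation of
   two NELT-element vectors.  Two such stages applied to the 4x8 example in
   the comment reproduce the documented RESULT_CHAIN.  */

static void
sketch_extract_even_odd (const int *v1, const int *v2, int nelt,
                         int *even, int *odd)
{
  for (int i = 0; i < nelt; i++)
    {
      /* EVEN[i] is element 2*i of the concatenation V1|V2 ...  */
      even[i] = 2 * i < nelt ? v1[2 * i] : v2[2 * i - nelt];
      /* ... and ODD[i] is element 2*i + 1.  */
      odd[i] = 2 * i + 1 < nelt ? v1[2 * i + 1] : v2[2 * i + 1 - nelt];
    }
}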
5960 /* Function vect_shift_permute_load_chain.
5962 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5963 sequence of stmts to reorder the input data accordingly.
5964 Return the final references for loads in RESULT_CHAIN.
5965 Return true if successful, false otherwise.
5967 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5968 The input is 3 vectors each containing 8 elements. We assign a
5969 number to each element, the input sequence is:
5971 1st vec: 0 1 2 3 4 5 6 7
5972 2nd vec: 8 9 10 11 12 13 14 15
5973 3rd vec: 16 17 18 19 20 21 22 23
5975 The output sequence should be:
5977 1st vec: 0 3 6 9 12 15 18 21
5978 2nd vec: 1 4 7 10 13 16 19 22
5979 3rd vec: 2 5 8 11 14 17 20 23
5981 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5983 First we shuffle all 3 vectors to get correct elements order:
5985 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5986 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5987 3rd vec: (16 19 22) (17 20 23) (18 21)
5989 Next we unite and shift vector 3 times:
5991 1st step:
5992 shift right by 6 the concatenation of:
5993 "1st vec" and "2nd vec"
5994 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5995 "2nd vec" and "3rd vec"
5996 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5997 "3rd vec" and "1st vec"
5998 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5999 | New vectors |
6001 So that now new vectors are:
6003 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6004 2nd vec: (10 13) (16 19 22) (17 20 23)
6005 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6007 2nd step:
6008 shift right by 5 the concatenation of:
6009 "1st vec" and "3rd vec"
6010 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6011 "2nd vec" and "1st vec"
6012 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6013 "3rd vec" and "2nd vec"
6014 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6015 | New vectors |
6017 So that now new vectors are:
6019 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6020 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6021 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6023 3rd step:
6024 shift right by 5 the concatenation of:
6025 "1st vec" and "1st vec"
6026 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6027 shift right by 3 the concatenation of:
6028 "2nd vec" and "2nd vec"
6029 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6030 | New vectors |
6032 So that now all vectors are READY:
6033 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6034 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6035 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6037 This algorithm is faster than one in vect_permute_load_chain if:
6038 1. "shift of a concatenation" is faster than general permutation.
6039 This is usually so.
6040 2. The TARGET machine can't execute vector instructions in parallel.
6041 This is because each step of the algorithm depends on the previous one.
6042 The algorithm in vect_permute_load_chain is much more parallel.
6044 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6047 static bool
6048 vect_shift_permute_load_chain (vec<tree> dr_chain,
6049 unsigned int length,
6050 gimple *stmt,
6051 gimple_stmt_iterator *gsi,
6052 vec<tree> *result_chain)
6054 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6055 tree perm2_mask1, perm2_mask2, perm3_mask;
6056 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6057 gimple *perm_stmt;
6059 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6060 unsigned int i;
6061 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6062 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6064 unsigned HOST_WIDE_INT nelt, vf;
6065 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6066 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6067 /* Not supported for variable-length vectors. */
6068 return false;
6070 vec_perm_builder sel (nelt, nelt, 1);
6071 sel.quick_grow (nelt);
6073 result_chain->quick_grow (length);
6074 memcpy (result_chain->address (), dr_chain.address (),
6075 length * sizeof (tree));
6077 if (pow2p_hwi (length) && vf > 4)
6079 unsigned int j, log_length = exact_log2 (length);
6080 for (i = 0; i < nelt / 2; ++i)
6081 sel[i] = i * 2;
6082 for (i = 0; i < nelt / 2; ++i)
6083 sel[nelt / 2 + i] = i * 2 + 1;
6084 vec_perm_indices indices (sel, 2, nelt);
6085 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6087 if (dump_enabled_p ())
6088 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6089 "shuffle of 2 fields structure is not \
6090 supported by target\n");
6091 return false;
6093 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6095 for (i = 0; i < nelt / 2; ++i)
6096 sel[i] = i * 2 + 1;
6097 for (i = 0; i < nelt / 2; ++i)
6098 sel[nelt / 2 + i] = i * 2;
6099 indices.new_vector (sel, 2, nelt);
6100 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6102 if (dump_enabled_p ())
6103 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6104 "shuffle of 2 fields structure is not \
6105 supported by target\n");
6106 return false;
6108 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6110 /* Generating permutation constant to shift all elements.
6111 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6112 for (i = 0; i < nelt; i++)
6113 sel[i] = nelt / 2 + i;
6114 indices.new_vector (sel, 2, nelt);
6115 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6117 if (dump_enabled_p ())
6118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6119 "shift permutation is not supported by target\n");
6120 return false;
6122 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6124 /* Generating permutation constant to select vector from 2.
6125 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6126 for (i = 0; i < nelt / 2; i++)
6127 sel[i] = i;
6128 for (i = nelt / 2; i < nelt; i++)
6129 sel[i] = nelt + i;
6130 indices.new_vector (sel, 2, nelt);
6131 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6133 if (dump_enabled_p ())
6134 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6135 "select is not supported by target\n");
6136 return false;
6138 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6140 for (i = 0; i < log_length; i++)
6142 for (j = 0; j < length; j += 2)
6144 first_vect = dr_chain[j];
6145 second_vect = dr_chain[j + 1];
6147 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6148 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6149 first_vect, first_vect,
6150 perm2_mask1);
6151 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6152 vect[0] = data_ref;
6154 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6155 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6156 second_vect, second_vect,
6157 perm2_mask2);
6158 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6159 vect[1] = data_ref;
6161 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6162 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6163 vect[0], vect[1], shift1_mask);
6164 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6165 (*result_chain)[j/2 + length/2] = data_ref;
6167 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6168 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6169 vect[0], vect[1], select_mask);
6170 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6171 (*result_chain)[j/2] = data_ref;
6173 memcpy (dr_chain.address (), result_chain->address (),
6174 length * sizeof (tree));
6176 return true;
6178 if (length == 3 && vf > 2)
6180 unsigned int k = 0, l = 0;
6182 /* Generating permutation constant to get all elements in the right order.
6183 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6184 for (i = 0; i < nelt; i++)
6186 if (3 * k + (l % 3) >= nelt)
6188 k = 0;
6189 l += (3 - (nelt % 3));
6191 sel[i] = 3 * k + (l % 3);
6192 k++;
6194 vec_perm_indices indices (sel, 2, nelt);
6195 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6197 if (dump_enabled_p ())
6198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6199 "shuffle of 3 fields structure is not \
6200 supported by target\n");
6201 return false;
6203 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6205 /* Generating permutation constant to shift all elements.
6206 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6207 for (i = 0; i < nelt; i++)
6208 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6209 indices.new_vector (sel, 2, nelt);
6210 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6212 if (dump_enabled_p ())
6213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6214 "shift permutation is not supported by target\n");
6215 return false;
6217 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6219 /* Generating permutation constant to shift all elements.
6220 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6221 for (i = 0; i < nelt; i++)
6222 sel[i] = 2 * (nelt / 3) + 1 + i;
6223 indices.new_vector (sel, 2, nelt);
6224 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6226 if (dump_enabled_p ())
6227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6228 "shift permutation is not supported by target\n");
6229 return false;
6231 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6233 /* Generating permutation constant to shift all elements.
6234 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6235 for (i = 0; i < nelt; i++)
6236 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6237 indices.new_vector (sel, 2, nelt);
6238 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6240 if (dump_enabled_p ())
6241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6242 "shift permutation is not supported by target\n");
6243 return false;
6245 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6247 /* Generating permutation constant to shift all elements.
6248 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6249 for (i = 0; i < nelt; i++)
6250 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6251 indices.new_vector (sel, 2, nelt);
6252 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6254 if (dump_enabled_p ())
6255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6256 "shift permutation is not supported by target\n");
6257 return false;
6259 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6261 for (k = 0; k < 3; k++)
6263 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6264 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6265 dr_chain[k], dr_chain[k],
6266 perm3_mask);
6267 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6268 vect[k] = data_ref;
6271 for (k = 0; k < 3; k++)
6273 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6274 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6275 vect[k % 3], vect[(k + 1) % 3],
6276 shift1_mask);
6277 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6278 vect_shift[k] = data_ref;
6281 for (k = 0; k < 3; k++)
6283 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6284 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6285 vect_shift[(4 - k) % 3],
6286 vect_shift[(3 - k) % 3],
6287 shift2_mask);
6288 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6289 vect[k] = data_ref;
6292 (*result_chain)[3 - (nelt % 3)] = vect[2];
6294 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6295 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6296 vect[0], shift3_mask);
6297 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6298 (*result_chain)[nelt % 3] = data_ref;
6300 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6301 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6302 vect[1], shift4_mask);
6303 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6304 (*result_chain)[0] = data_ref;
6305 return true;
6307 return false;
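/* A standalone sketch (not GCC code; the function name is illustrative) of
   the "right order" selector built at the start of the LENGTH == 3 path
   above.  It mirrors the K/L loop verbatim: for NELT == 8 it produces
   { 0, 3, 6, 1, 4, 7, 2, 5 }, i.e. all elements of field 0, then field 1,
   then field 2 of the 3-element structure.  */

static void
sketch_shift3_order_selector (unsigned int nelt, unsigned int *sel)
{
  unsigned int k = 0, l = 0;
  for (unsigned int i = 0; i < nelt; i++)
    {
      if (3 * k + (l % 3) >= nelt)
        {
          /* Start over in the next field once 3*K runs off the vector.  */
          k = 0;
          l += 3 - (nelt % 3);
        }
      sel[i] = 3 * k + (l % 3);
      k++;
    }
}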
6310 /* Function vect_transform_grouped_load.
6312 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6313 to perform their permutation and ascribe the resulting vectorized statements to
6314 the scalar statements.
6317 void
6318 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6319 gimple_stmt_iterator *gsi)
6321 machine_mode mode;
6322 vec<tree> result_chain = vNULL;
6324 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6325 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6326 vectors, that are ready for vector computation. */
6327 result_chain.create (size);
6329 /* If the reassociation width for the vector type is 2 or greater, the target
6330 machine can execute 2 or more vector instructions in parallel. Otherwise
6331 try to get the chain for the load group using vect_shift_permute_load_chain. */
6332 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6333 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6334 || pow2p_hwi (size)
6335 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6336 gsi, &result_chain))
6337 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6338 vect_record_grouped_load_vectors (stmt, result_chain);
6339 result_chain.release ();
6342 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6343 generated as part of the vectorization of STMT. Assign the statement
6344 for each vector to the associated scalar statement. */
6346 void
6347 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6349 gimple *first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6350 gimple *next_stmt, *new_stmt;
6351 unsigned int i, gap_count;
6352 tree tmp_data_ref;
6354 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6355 Since we scan the chain starting from its first node, their order
6356 corresponds to the order of data-refs in RESULT_CHAIN.
6357 next_stmt = first_stmt;
6358 gap_count = 1;
6359 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6361 if (!next_stmt)
6362 break;
6364 /* Skip the gaps. Loads created for the gaps will be removed by dead
6365 code elimination pass later. No need to check for the first stmt in
6366 the group, since it always exists.
6367 DR_GROUP_GAP is the number of steps in elements from the previous
6368 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6369 correspond to the gaps. */
6370 if (next_stmt != first_stmt
6371 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
6373 gap_count++;
6374 continue;
6377 while (next_stmt)
6379 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6380 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6381 copies, and we put the new vector statement in the first available
6382 RELATED_STMT. */
6383 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6384 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6385 else
6387 if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6389 gimple *prev_stmt =
6390 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6391 gimple *rel_stmt =
6392 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6393 while (rel_stmt)
6395 prev_stmt = rel_stmt;
6396 rel_stmt =
6397 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6400 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6401 new_stmt;
6405 next_stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6406 gap_count = 1;
6407 /* If NEXT_STMT accesses the same DR as the previous statement,
6408 put the same TMP_DATA_REF as its vectorized statement; otherwise
6409 get the next data-ref from RESULT_CHAIN. */
6410 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6411 break;
6416 /* Function vect_can_force_dr_alignment_p.
6418 Returns whether the alignment of a DECL can be forced to be aligned
6419 on an ALIGNMENT-bit boundary. */
6421 bool
6422 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6424 if (!VAR_P (decl))
6425 return false;
6427 if (decl_in_symtab_p (decl)
6428 && !symtab_node::get (decl)->can_increase_alignment_p ())
6429 return false;
6431 if (TREE_STATIC (decl))
6432 return (alignment <= MAX_OFILE_ALIGNMENT);
6433 else
6434 return (alignment <= MAX_STACK_ALIGNMENT);
6438 /* Return whether the data reference DR is supported with respect to its
6439 alignment.
6440 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6441 if it is aligned, i.e., check if it is possible to vectorize it with different
6442 alignment. */
6444 enum dr_alignment_support
6445 vect_supportable_dr_alignment (struct data_reference *dr,
6446 bool check_aligned_accesses)
6448 gimple *stmt = vect_dr_stmt (dr);
6449 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6450 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6451 machine_mode mode = TYPE_MODE (vectype);
6452 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6453 struct loop *vect_loop = NULL;
6454 bool nested_in_vect_loop = false;
6456 if (aligned_access_p (dr) && !check_aligned_accesses)
6457 return dr_aligned;
6459 /* For now assume all conditional loads/stores support unaligned
6460 access without any special code. */
6461 if (is_gimple_call (stmt)
6462 && gimple_call_internal_p (stmt)
6463 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6464 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6465 return dr_unaligned_supported;
6467 if (loop_vinfo)
6469 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6470 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6473 /* Possibly unaligned access. */
6475 /* We can choose between using the implicit realignment scheme (generating
6476 a misaligned_move stmt) and the explicit realignment scheme (generating
6477 aligned loads with a REALIGN_LOAD). There are two variants to the
6478 explicit realignment scheme: optimized, and unoptimized.
6479 We can optimize the realignment only if the step between consecutive
6480 vector loads is equal to the vector size. Since the vector memory
6481 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6482 is guaranteed that the misalignment amount remains the same throughout the
6483 execution of the vectorized loop. Therefore, we can create the
6484 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6485 at the loop preheader.
6487 However, in the case of outer-loop vectorization, when vectorizing a
6488 memory access in the inner-loop nested within the LOOP that is now being
6489 vectorized, while it is guaranteed that the misalignment of the
6490 vectorized memory access will remain the same in different outer-loop
6491 iterations, it is *not* guaranteed that it will remain the same throughout
6492 the execution of the inner-loop. This is because the inner-loop advances
6493 with the original scalar step (and not in steps of VS). If the inner-loop
6494 step happens to be a multiple of VS, then the misalignment remains fixed
6495 and we can use the optimized realignment scheme. For example:
6497 for (i=0; i<N; i++)
6498 for (j=0; j<M; j++)
6499 s += a[i+j];
6501 When vectorizing the i-loop in the above example, the step between
6502 consecutive vector loads is 1, and so the misalignment does not remain
6503 fixed across the execution of the inner-loop, and the realignment cannot
6504 be optimized (as illustrated in the following pseudo vectorized loop):
6506 for (i=0; i<N; i+=4)
6507 for (j=0; j<M; j++){
6508 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6509 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6510 // (assuming that we start from an aligned address).
6513 We therefore have to use the unoptimized realignment scheme:
6515 for (i=0; i<N; i+=4)
6516 for (j=k; j<M; j+=4)
6517 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6518 // that the misalignment of the initial address is
6519 // 0).
6521 The loop can then be vectorized as follows:
6523 for (k=0; k<4; k++){
6524 rt = get_realignment_token (&vp[k]);
6525 for (i=0; i<N; i+=4){
6526 v1 = vp[i+k];
6527 for (j=k; j<M; j+=4){
6528 v2 = vp[i+j+VS-1];
6529 va = REALIGN_LOAD <v1,v2,rt>;
6530 vs += va;
6531 v1 = v2;
6534 } */
6536 if (DR_IS_READ (dr))
6538 bool is_packed = false;
6539 tree type = (TREE_TYPE (DR_REF (dr)));
6541 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6542 && (!targetm.vectorize.builtin_mask_for_load
6543 || targetm.vectorize.builtin_mask_for_load ()))
6545 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6547 /* If we are doing SLP then the accesses need not have the
6548 same alignment, instead it depends on the SLP group size. */
6549 if (loop_vinfo
6550 && STMT_SLP_TYPE (stmt_info)
6551 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6552 * DR_GROUP_SIZE (vinfo_for_stmt
6553 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6554 TYPE_VECTOR_SUBPARTS (vectype)))
6556 else if (!loop_vinfo
6557 || (nested_in_vect_loop
6558 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6559 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6560 return dr_explicit_realign;
6561 else
6562 return dr_explicit_realign_optimized;
6564 if (!known_alignment_for_access_p (dr))
6565 is_packed = not_size_aligned (DR_REF (dr));
6567 if (targetm.vectorize.support_vector_misalignment
6568 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6569 /* Can't software pipeline the loads, but can at least do them. */
6570 return dr_unaligned_supported;
6572 else
6574 bool is_packed = false;
6575 tree type = (TREE_TYPE (DR_REF (dr)));
6577 if (!known_alignment_for_access_p (dr))
6578 is_packed = not_size_aligned (DR_REF (dr));
6580 if (targetm.vectorize.support_vector_misalignment
6581 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6582 return dr_unaligned_supported;
6585 /* Unsupported. */
6586 return dr_unaligned_unsupported;
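/* A small sketch (not part of GCC; the function name and parameters are
   illustrative assumptions) of the observation in the comment above: with
   a scalar inner-loop step of STEP and a vector size of VS (in the same
   units), the misalignment of the J-th inner-loop access is
   (BASE_MISALIGN + J * STEP) % VS.  It stays constant across J, so the
   optimized realignment scheme applies, only when STEP is a multiple of
   VS.  */

static unsigned int
sketch_inner_loop_misalignment (unsigned int base_misalign,
                                unsigned int step, unsigned int vs,
                                unsigned int j)
{
  /* E.g. base 0, step 1, vs 4 gives the {0, 1, 2, 3, 0, 1, ...} pattern
     shown in the pseudo-code above.  */
  return (base_misalign + j * step) % vs;
}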