gcc/tree-vect-data-refs.cc

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2023 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "tree-cfg.h"
  53 #include "tree-hash-traits.h"
  54 #include "vec-perm-indices.h"
  55 #include "internal-fn.h"
  56 #include "gimple-fold.h"
  57
  58 /* Return true if load- or store-lanes optab OPTAB is implemented for
  59    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  60
  61 static bool
  62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  63                               tree vectype, unsigned HOST_WIDE_INT count)
  64 {
  65   machine_mode mode, array_mode;
  66   bool limit_p;
  67
  68   mode = TYPE_MODE (vectype);
  69   if (!targetm.array_mode (mode, count).exists (&array_mode))
  70     {
  71       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
  72       limit_p = !targetm.array_mode_supported_p (mode, count);
  73       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
  74         {
  75           if (dump_enabled_p ())
  76             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  77                              "no array mode for %s[%wu]\n",
  78                              GET_MODE_NAME (mode), count);
  79           return false;
  80         }
  81     }
  82
  83   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  84     {
  85       if (dump_enabled_p ())
  86         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  87                          "cannot use %s<%s><%s>\n", name,
  88                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  89       return false;
  90     }
  91
  92   if (dump_enabled_p ())
  93     dump_printf_loc (MSG_NOTE, vect_location,
  94                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  95                      GET_MODE_NAME (mode));
  96
  97   return true;
  98 }
  99
 100 /* Helper function to identify a simd clone call.  If this is a call to a
 101    function with simd clones then return the corresponding cgraph_node,
 102    otherwise return NULL.  */
 103
 104 static cgraph_node*
 105 simd_clone_call_p (gimple *stmt)
 106 {
 107   gcall *call = dyn_cast <gcall *> (stmt);
 108   if (!call)
 109     return NULL;
 110
 111   tree fndecl = NULL_TREE;
 112   if (gimple_call_internal_p (call, IFN_MASK_CALL))
 113     fndecl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
 114   else
 115     fndecl = gimple_call_fndecl (stmt);
 116
 117   if (fndecl == NULL_TREE)
 118     return NULL;
 119
 120   cgraph_node *node = cgraph_node::get (fndecl);
 121   if (node && node->simd_clones != NULL)
 122     return node;
 123
 124   return NULL;
 125 }
 126
 127
 128
 129 /* Return the smallest scalar part of STMT_INFO.
 130    This is used to determine the vectype of the stmt.  We generally set the
 131    vectype according to the type of the result (lhs).  For stmts whose
 132    result-type is different than the type of the arguments (e.g., demotion,
 133    promotion), vectype will be reset appropriately (later).  Note that we have
 134    to visit the smallest datatype in this function, because that determines the
 135    VF.  If the smallest datatype in the loop is present only as the rhs of a
 136    promotion operation - we'd miss it.
 137    Such a case, where a variable of this datatype does not appear in the lhs
 138    anywhere in the loop, can only occur if it's an invariant: e.g.:
 139    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 140    invariant motion.  However, we cannot rely on invariant motion to always
 141    take invariants out of the loop, and so in the case of promotion we also
 142    have to check the rhs.
 143    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 144    types.  */
 145
 146 tree
 147 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
 148 {
 149   HOST_WIDE_INT lhs, rhs;
 150
 151   /* During the analysis phase, this function is called on arbitrary
 152      statements that might not have scalar results.  */
 153   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
 154     return scalar_type;
 155
 156   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 157
 158   gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
 159   if (assign)
 160     {
 161       scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
 162       if (gimple_assign_cast_p (assign)
 163           || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
 164           || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
 165           || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
 166           || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
 167           || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
 168         {
 169           tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
 170
 171           rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 172           if (rhs < lhs)
 173             scalar_type = rhs_type;
 174         }
 175     }
 176   else if (cgraph_node *node = simd_clone_call_p (stmt_info->stmt))
 177     {
 178       auto clone = node->simd_clones->simdclone;
 179       for (unsigned int i = 0; i < clone->nargs; ++i)
 180         {
 181           if (clone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
 182             {
 183               tree arg_scalar_type = TREE_TYPE (clone->args[i].vector_type);
 184               rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (arg_scalar_type));
 185               if (rhs < lhs)
 186                 {
 187                   scalar_type = arg_scalar_type;
 188                   lhs = rhs;
 189                 }
 190             }
 191         }
 192     }
 193   else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
 194     {
 195       unsigned int i = 0;
 196       if (gimple_call_internal_p (call))
 197         {
 198           internal_fn ifn = gimple_call_internal_fn (call);
 199           if (internal_load_fn_p (ifn))
 200             /* For loads the LHS type does the trick.  */
 201             i = ~0U;
 202           else if (internal_store_fn_p (ifn))
 203             {
 204               /* For stores use the tyep of the stored value.  */
 205               i = internal_fn_stored_value_index (ifn);
 206               scalar_type = TREE_TYPE (gimple_call_arg (call, i));
 207               i = ~0U;
 208             }
 209           else if (internal_fn_mask_index (ifn) == 0)
 210             i = 1;
 211         }
 212       if (i < gimple_call_num_args (call))
 213         {
 214           tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
 215           if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
 216             {
 217               rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 218               if (rhs < lhs)
 219                 scalar_type = rhs_type;
 220             }
 221         }
 222     }
 223
 224   return scalar_type;
 225 }
 226
 227
 228 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 229    tested at run-time.  Return TRUE if DDR was successfully inserted.
 230    Return false if versioning is not supported.  */
 231
 232 static opt_result
 233 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 234 {
 235   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 236
 237   if ((unsigned) param_vect_max_version_for_alias_checks == 0)
 238     return opt_result::failure_at (vect_location,
 239                                    "will not create alias checks, as"
 240                                    " --param vect-max-version-for-alias-checks"
 241                                    " == 0\n");
 242
 243   opt_result res
 244     = runtime_alias_check_p (ddr, loop,
 245                              optimize_loop_nest_for_speed_p (loop));
 246   if (!res)
 247     return res;
 248
 249   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 250   return opt_result::success ();
 251 }
 252
 253 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
 254
 255 static void
 256 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
 257 {
 258   const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
 259   for (unsigned int i = 0; i < checks.length(); ++i)
 260     if (checks[i] == value)
 261       return;
 262
 263   if (dump_enabled_p ())
 264     dump_printf_loc (MSG_NOTE, vect_location,
 265                      "need run-time check that %T is nonzero\n",
 266                      value);
 267   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
 268 }
 269
 270 /* Return true if we know that the order of vectorized DR_INFO_A and
 271    vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
 272    DR_INFO_B.  At least one of the accesses is a write.  */
 273
 274 static bool
 275 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
 276 {
 277   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
 278   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
 279
 280   /* Single statements are always kept in their original order.  */
 281   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 282       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 283     return true;
 284
 285   /* STMT_A and STMT_B belong to overlapping groups.  All loads are
 286      emitted at the position of the first scalar load.
 287      Stores in a group are emitted at the position of the last scalar store.
 288      Compute that position and check whether the resulting order matches
 289      the current one.  */
 290   stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
 291   if (il_a)
 292     {
 293       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
 294         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
 295              s = DR_GROUP_NEXT_ELEMENT (s))
 296           il_a = get_later_stmt (il_a, s);
 297       else /* DR_IS_READ */
 298         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
 299              s = DR_GROUP_NEXT_ELEMENT (s))
 300           if (get_later_stmt (il_a, s) == il_a)
 301             il_a = s;
 302     }
 303   else
 304     il_a = stmtinfo_a;
 305   stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
 306   if (il_b)
 307     {
 308       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
 309         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
 310              s = DR_GROUP_NEXT_ELEMENT (s))
 311           il_b = get_later_stmt (il_b, s);
 312       else /* DR_IS_READ */
 313         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
 314              s = DR_GROUP_NEXT_ELEMENT (s))
 315           if (get_later_stmt (il_b, s) == il_b)
 316             il_b = s;
 317     }
 318   else
 319     il_b = stmtinfo_b;
 320   bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
 321   return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
 322 }
 323
 324 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 325    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 326    distances.  These distances are conservatively correct but they don't
 327    reflect a guaranteed dependence.
 328
 329    Return true if this function does all the work necessary to avoid
 330    an alias or false if the caller should use the dependence distances
 331    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 332    the depth of the loop described by LOOP_VINFO and the other arguments
 333    are as for vect_analyze_data_ref_dependence.  */
 334
 335 static bool
 336 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 337                                        loop_vec_info loop_vinfo,
 338                                        int loop_depth, unsigned int *max_vf)
 339 {
 340   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 341   for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
 342     {
 343       int dist = dist_v[loop_depth];
 344       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 345         {
 346           /* If the user asserted safelen >= DIST consecutive iterations
 347              can be executed concurrently, assume independence.
 348
 349              ??? An alternative would be to add the alias check even
 350              in this case, and vectorize the fallback loop with the
 351              maximum VF set to safelen.  However, if the user has
 352              explicitly given a length, it's less likely that that
 353              would be a win.  */
 354           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 355             {
 356               if ((unsigned int) loop->safelen < *max_vf)
 357                 *max_vf = loop->safelen;
 358               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 359               continue;
 360             }
 361
 362           /* For dependence distances of 2 or more, we have the option
 363              of limiting VF or checking for an alias at runtime.
 364              Prefer to check at runtime if we can, to avoid limiting
 365              the VF unnecessarily when the bases are in fact independent.
 366
 367              Note that the alias checks will be removed if the VF ends up
 368              being small enough.  */
 369           dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
 370           dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
 371           return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
 372                   && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
 373                   && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
 374         }
 375     }
 376   return true;
 377 }
 378
 379
 380 /* Function vect_analyze_data_ref_dependence.
 381
 382    FIXME: I needed to change the sense of the returned flag.
 383
 384    Return FALSE if there (might) exist a dependence between a memory-reference
 385    DRA and a memory-reference DRB.  When versioning for alias may check a
 386    dependence at run-time, return TRUE.  Adjust *MAX_VF according to
 387    the data dependence.  */
 388
 389 static opt_result
 390 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 391                                   loop_vec_info loop_vinfo,
 392                                   unsigned int *max_vf)
 393 {
 394   unsigned int i;
 395   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 396   struct data_reference *dra = DDR_A (ddr);
 397   struct data_reference *drb = DDR_B (ddr);
 398   dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
 399   dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
 400   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
 401   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
 402   lambda_vector dist_v;
 403   unsigned int loop_depth;
 404
 405   /* If user asserted safelen consecutive iterations can be
 406      executed concurrently, assume independence.  */
 407   auto apply_safelen = [&]()
 408     {
 409       if (loop->safelen >= 2)
 410         {
 411           if ((unsigned int) loop->safelen < *max_vf)
 412             *max_vf = loop->safelen;
 413           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 414           return true;
 415         }
 416       return false;
 417     };
 418
 419   /* In loop analysis all data references should be vectorizable.  */
 420   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 421       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 422     gcc_unreachable ();
 423
 424   /* Independent data accesses.  */
 425   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 426     return opt_result::success ();
 427
 428   if (dra == drb
 429       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 430     return opt_result::success ();
 431
 432   /* We do not have to consider dependences between accesses that belong
 433      to the same group, unless the stride could be smaller than the
 434      group size.  */
 435   if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 436       && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 437           == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
 438       && !STMT_VINFO_STRIDED_P (stmtinfo_a))
 439     return opt_result::success ();
 440
 441   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 442      least two scalar iterations, there is always also a true dependence.
 443      As the vectorizer does not re-order loads and stores we can ignore
 444      the anti-dependence if TBAA can disambiguate both DRs similar to the
 445      case with known negative distance anti-dependences (positive
 446      distance anti-dependences would violate TBAA constraints).  */
 447   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 448        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 449       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 450                                  get_alias_set (DR_REF (drb))))
 451     return opt_result::success ();
 452
 453   if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 454       || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 455     {
 456       if (apply_safelen ())
 457         return opt_result::success ();
 458
 459       return opt_result::failure_at
 460         (stmtinfo_a->stmt,
 461          "possible alias involving gather/scatter between %T and %T\n",
 462          DR_REF (dra), DR_REF (drb));
 463     }
 464
 465   /* Unknown data dependence.  */
 466   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 467     {
 468       if (apply_safelen ())
 469         return opt_result::success ();
 470
 471       if (dump_enabled_p ())
 472         dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
 473                          "versioning for alias required: "
 474                          "can't determine dependence between %T and %T\n",
 475                          DR_REF (dra), DR_REF (drb));
 476
 477       /* Add to list of ddrs that need to be tested at run-time.  */
 478       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 479     }
 480
 481   /* Known data dependence.  */
 482   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 483     {
 484       if (apply_safelen ())
 485         return opt_result::success ();
 486
 487       if (dump_enabled_p ())
 488         dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
 489                          "versioning for alias required: "
 490                          "bad dist vector for %T and %T\n",
 491                          DR_REF (dra), DR_REF (drb));
 492       /* Add to list of ddrs that need to be tested at run-time.  */
 493       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 494     }
 495
 496   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 497
 498   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
 499       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
 500                                                 loop_depth, max_vf))
 501     return opt_result::success ();
 502
 503   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 504     {
 505       int dist = dist_v[loop_depth];
 506
 507       if (dump_enabled_p ())
 508         dump_printf_loc (MSG_NOTE, vect_location,
 509                          "dependence distance  = %d.\n", dist);
 510
 511       if (dist == 0)
 512         {
 513           if (dump_enabled_p ())
 514             dump_printf_loc (MSG_NOTE, vect_location,
 515                              "dependence distance == 0 between %T and %T\n",
 516                              DR_REF (dra), DR_REF (drb));
 517
 518           /* When we perform grouped accesses and perform implicit CSE
 519              by detecting equal accesses and doing disambiguation with
 520              runtime alias tests like for
 521                 .. = a[i];
 522                 .. = a[i+1];
 523                 a[i] = ..;
 524                 a[i+1] = ..;
 525                 *p = ..;
 526                 .. = a[i];
 527                 .. = a[i+1];
 528              where we will end up loading { a[i], a[i+1] } once, make
 529              sure that inserting group loads before the first load and
 530              stores after the last store will do the right thing.
 531              Similar for groups like
 532                 a[i] = ...;
 533                 ... = a[i];
 534                 a[i+1] = ...;
 535              where loads from the group interleave with the store.  */
 536           if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
 537             return opt_result::failure_at (stmtinfo_a->stmt,
 538                                            "READ_WRITE dependence"
 539                                            " in interleaving.\n");
 540
 541           if (loop->safelen < 2)
 542             {
 543               tree indicator = dr_zero_step_indicator (dra);
 544               if (!indicator || integer_zerop (indicator))
 545                 return opt_result::failure_at (stmtinfo_a->stmt,
 546                                                "access also has a zero step\n");
 547               else if (TREE_CODE (indicator) != INTEGER_CST)
 548                 vect_check_nonzero_value (loop_vinfo, indicator);
 549             }
 550           continue;
 551         }
 552
 553       if (dist > 0 && DDR_REVERSED_P (ddr))
 554         {
 555           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 556              reversed (to make distance vector positive), and the actual
 557              distance is negative.  */
 558           if (dump_enabled_p ())
 559             dump_printf_loc (MSG_NOTE, vect_location,
 560                              "dependence distance negative.\n");
 561           /* When doing outer loop vectorization, we need to check if there is
 562              a backward dependence at the inner loop level if the dependence
 563              at the outer loop is reversed.  See PR81740.  */
 564           if (nested_in_vect_loop_p (loop, stmtinfo_a)
 565               || nested_in_vect_loop_p (loop, stmtinfo_b))
 566             {
 567               unsigned inner_depth = index_in_loop_nest (loop->inner->num,
 568                                                          DDR_LOOP_NEST (ddr));
 569               if (dist_v[inner_depth] < 0)
 570                 return opt_result::failure_at (stmtinfo_a->stmt,
 571                                                "not vectorized, dependence "
 572                                                "between data-refs %T and %T\n",
 573                                                DR_REF (dra), DR_REF (drb));
 574             }
 575           /* Record a negative dependence distance to later limit the
 576              amount of stmt copying / unrolling we can perform.
 577              Only need to handle read-after-write dependence.  */
 578           if (DR_IS_READ (drb)
 579               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 580                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 581             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 582           continue;
 583         }
 584
 585       unsigned int abs_dist = abs (dist);
 586       if (abs_dist >= 2 && abs_dist < *max_vf)
 587         {
 588           /* The dependence distance requires reduction of the maximal
 589              vectorization factor.  */
 590           *max_vf = abs_dist;
 591           if (dump_enabled_p ())
 592             dump_printf_loc (MSG_NOTE, vect_location,
 593                              "adjusting maximal vectorization factor to %i\n",
 594                              *max_vf);
 595         }
 596
 597       if (abs_dist >= *max_vf)
 598         {
 599           /* Dependence distance does not create dependence, as far as
 600              vectorization is concerned, in this case.  */
 601           if (dump_enabled_p ())
 602             dump_printf_loc (MSG_NOTE, vect_location,
 603                              "dependence distance >= VF.\n");
 604           continue;
 605         }
 606
 607       return opt_result::failure_at (stmtinfo_a->stmt,
 608                                      "not vectorized, possible dependence "
 609                                      "between data-refs %T and %T\n",
 610                                      DR_REF (dra), DR_REF (drb));
 611     }
 612
 613   return opt_result::success ();
 614 }
 615
 616 /* Function vect_analyze_data_ref_dependences.
 617
 618    Examine all the data references in the loop, and make sure there do not
 619    exist any data dependences between them.  Set *MAX_VF according to
 620    the maximum vectorization factor the data dependences allow.  */
 621
 622 opt_result
 623 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
 624                                    unsigned int *max_vf)
 625 {
 626   unsigned int i;
 627   struct data_dependence_relation *ddr;
 628
 629   DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
 630
 631   if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
 632     {
 633       LOOP_VINFO_DDRS (loop_vinfo)
 634         .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 635                  * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 636       /* We do not need read-read dependences.  */
 637       bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 638                                           &LOOP_VINFO_DDRS (loop_vinfo),
 639                                           LOOP_VINFO_LOOP_NEST (loop_vinfo),
 640                                           false);
 641       gcc_assert (res);
 642     }
 643
 644   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 645
 646   /* For epilogues we either have no aliases or alias versioning
 647      was applied to original loop.  Therefore we may just get max_vf
 648      using VF of original loop.  */
 649   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 650     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 651   else
 652     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 653       {
 654         opt_result res
 655           = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
 656         if (!res)
 657           return res;
 658       }
 659
 660   return opt_result::success ();
 661 }
 662
 663
 664 /* Function vect_slp_analyze_data_ref_dependence.
 665
 666    Return TRUE if there (might) exist a dependence between a memory-reference
 667    DRA and a memory-reference DRB for VINFO.  When versioning for alias
 668    may check a dependence at run-time, return FALSE.  Adjust *MAX_VF
 669    according to the data dependence.  */
 670
 671 static bool
 672 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
 673                                       struct data_dependence_relation *ddr)
 674 {
 675   struct data_reference *dra = DDR_A (ddr);
 676   struct data_reference *drb = DDR_B (ddr);
 677   dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
 678   dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
 679
 680   /* We need to check dependences of statements marked as unvectorizable
 681      as well, they still can prohibit vectorization.  */
 682
 683   /* Independent data accesses.  */
 684   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 685     return false;
 686
 687   if (dra == drb)
 688     return false;
 689
 690   /* Read-read is OK.  */
 691   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 692     return false;
 693
 694   /* If dra and drb are part of the same interleaving chain consider
 695      them independent.  */
 696   if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
 697       && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
 698           == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
 699     return false;
 700
 701   /* Unknown data dependence.  */
 702   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 703     {
 704       if  (dump_enabled_p ())
 705         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 706                          "can't determine dependence between %T and %T\n",
 707                          DR_REF (dra), DR_REF (drb));
 708     }
 709   else if (dump_enabled_p ())
 710     dump_printf_loc (MSG_NOTE, vect_location,
 711                      "determined dependence between %T and %T\n",
 712                      DR_REF (dra), DR_REF (drb));
 713
 714   return true;
 715 }
 716
 717
 718 /* Analyze dependences involved in the transform of a store SLP NODE.  */
 719
 720 static bool
 721 vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
 722 {
 723   /* This walks over all stmts involved in the SLP store done
 724      in NODE verifying we can sink them up to the last stmt in the
 725      group.  */
 726   stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
 727   gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));
 728
 729   for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
 730     {
 731       stmt_vec_info access_info
 732         = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
 733       if (access_info == last_access_info)
 734         continue;
 735       data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
 736       ao_ref ref;
 737       bool ref_initialized_p = false;
 738       for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
 739            gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
 740         {
 741           gimple *stmt = gsi_stmt (gsi);
 742           if (! gimple_vuse (stmt))
 743             continue;
 744
 745           /* If we couldn't record a (single) data reference for this
 746              stmt we have to resort to the alias oracle.  */
 747           stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
 748           data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
 749           if (!dr_b)
 750             {
 751               /* We are moving a store - this means
 752                  we cannot use TBAA for disambiguation.  */
 753               if (!ref_initialized_p)
 754                 ao_ref_init (&ref, DR_REF (dr_a));
 755               if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
 756                   || ref_maybe_used_by_stmt_p (stmt, &ref, false))
 757                 return false;
 758               continue;
 759             }
 760
 761           gcc_assert (!gimple_visited_p (stmt));
 762
 763           ddr_p ddr = initialize_data_dependence_relation (dr_a,
 764                                                            dr_b, vNULL);
 765           bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 766           free_dependence_relation (ddr);
 767           if (dependent)
 768             return false;
 769         }
 770     }
 771   return true;
 772 }
 773
 774 /* Analyze dependences involved in the transform of a load SLP NODE.  STORES
 775    contain the vector of scalar stores of this instance if we are
 776    disambiguating the loads.  */
 777
 778 static bool
 779 vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
 780                                    vec<stmt_vec_info> stores,
 781                                    stmt_vec_info last_store_info)
 782 {
 783   /* This walks over all stmts involved in the SLP load done
 784      in NODE verifying we can hoist them up to the first stmt in the
 785      group.  */
 786   stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
 787   gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));
 788
 789   for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
 790     {
 791       stmt_vec_info access_info
 792         = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
 793       if (access_info == first_access_info)
 794         continue;
 795       data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
 796       ao_ref ref;
 797       bool ref_initialized_p = false;
 798       hash_set<stmt_vec_info> grp_visited;
 799       for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
 800            gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
 801         {
 802           gimple *stmt = gsi_stmt (gsi);
 803           if (! gimple_vdef (stmt))
 804             continue;
 805
 806           stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
 807
 808           /* If we run into a store of this same instance (we've just
 809              marked those) then delay dependence checking until we run
 810              into the last store because this is where it will have
 811              been sunk to (and we verified that we can do that already).  */
 812           if (gimple_visited_p (stmt))
 813             {
 814               if (stmt_info != last_store_info)
 815                 continue;
 816
 817               for (stmt_vec_info &store_info : stores)
 818                 {
 819                   data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
 820                   ddr_p ddr = initialize_data_dependence_relation
 821                                 (dr_a, store_dr, vNULL);
 822                   bool dependent
 823                     = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 824                   free_dependence_relation (ddr);
 825                   if (dependent)
 826                     return false;
 827                 }
 828               continue;
 829             }
 830
 831           auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
 832             {
 833               /* We are hoisting a load - this means we can use TBAA for
 834                  disambiguation.  */
 835               if (!ref_initialized_p)
 836                 ao_ref_init (&ref, DR_REF (dr_a));
 837               if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
 838                 {
 839                   /* If we couldn't record a (single) data reference for this
 840                      stmt we have to give up now.  */
 841                   data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
 842                   if (!dr_b)
 843                     return false;
 844                   ddr_p ddr = initialize_data_dependence_relation (dr_a,
 845                                                                    dr_b, vNULL);
 846                   bool dependent
 847                     = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 848                   free_dependence_relation (ddr);
 849                   if (dependent)
 850                     return false;
 851                 }
 852               /* No dependence.  */
 853               return true;
 854             };
 855           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 856             {
 857               /* When we run into a store group we have to honor
 858                  that earlier stores might be moved here.  We don't
 859                  know exactly which and where to since we lack a
 860                  back-mapping from DR to SLP node, so assume all
 861                  earlier stores are sunk here.  It's enough to
 862                  consider the last stmt of a group for this.
 863                  ???  Both this and the fact that we disregard that
 864                  the conflicting instance might be removed later
 865                  is overly conservative.  */
 866               if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
 867                 for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
 868                      store_info != NULL;
 869                      store_info = DR_GROUP_NEXT_ELEMENT (store_info))
 870                   if ((store_info == stmt_info
 871                        || get_later_stmt (store_info, stmt_info) == stmt_info)
 872                       && !check_hoist (store_info))
 873                     return false;
 874             }
 875           else
 876             {
 877               if (!check_hoist (stmt_info))
 878                 return false;
 879             }
 880         }
 881     }
 882   return true;
 883 }
 884
 885
 886 /* Function vect_analyze_data_ref_dependences.
 887
 888    Examine all the data references in the basic-block, and make sure there
 889    do not exist any data dependences between them.  Set *MAX_VF according to
 890    the maximum vectorization factor the data dependences allow.  */
 891
 892 bool
 893 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
 894 {
 895   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
 896
 897   /* The stores of this instance are at the root of the SLP tree.  */
 898   slp_tree store = NULL;
 899   if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
 900     store = SLP_INSTANCE_TREE (instance);
 901
 902   /* Verify we can sink stores to the vectorized stmt insert location.  */
 903   stmt_vec_info last_store_info = NULL;
 904   if (store)
 905     {
 906       if (! vect_slp_analyze_store_dependences (vinfo, store))
 907         return false;
 908
 909       /* Mark stores in this instance and remember the last one.  */
 910       last_store_info = vect_find_last_scalar_stmt_in_slp (store);
 911       for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
 912         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
 913     }
 914
 915   bool res = true;
 916
 917   /* Verify we can sink loads to the vectorized stmt insert location,
 918      special-casing stores of this instance.  */
 919   for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
 920     if (! vect_slp_analyze_load_dependences (vinfo, load,
 921                                              store
 922                                              ? SLP_TREE_SCALAR_STMTS (store)
 923                                              : vNULL, last_store_info))
 924       {
 925         res = false;
 926         break;
 927       }
 928
 929   /* Unset the visited flag.  */
 930   if (store)
 931     for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
 932       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
 933
 934   return res;
 935 }
 936
 937 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
 938    applied.  */
 939
 940 int
 941 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
 942 {
 943   HOST_WIDE_INT diff = 0;
 944   /* Alignment is only analyzed for the first element of a DR group,
 945      use that but adjust misalignment by the offset of the access.  */
 946   if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
 947     {
 948       dr_vec_info *first_dr
 949         = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
 950       /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
 951          INTEGER_CSTs and the first element in the group has the lowest
 952          address.  */
 953       diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
 954               - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
 955       gcc_assert (diff >= 0);
 956       dr_info = first_dr;
 957     }
 958
 959   int misalign = dr_info->misalignment;
 960   gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
 961   if (misalign == DR_MISALIGNMENT_UNKNOWN)
 962     return misalign;
 963
 964   /* If the access is only aligned for a vector type with smaller alignment
 965      requirement the access has unknown misalignment.  */
 966   if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
 967                 targetm.vectorize.preferred_vector_alignment (vectype)))
 968     return DR_MISALIGNMENT_UNKNOWN;
 969
 970   /* Apply the offset from the DR group start and the externally supplied
 971      offset which can for example result from a negative stride access.  */
 972   poly_int64 misalignment = misalign + diff + offset;
 973
 974   /* vect_compute_data_ref_alignment will have ensured that target_alignment
 975      is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN.  */
 976   unsigned HOST_WIDE_INT target_alignment_c
 977     = dr_info->target_alignment.to_constant ();
 978   if (!known_misalignment (misalignment, target_alignment_c, &misalign))
 979     return DR_MISALIGNMENT_UNKNOWN;
 980   return misalign;
 981 }
 982
 983 /* Record the base alignment guarantee given by DRB, which occurs
 984    in STMT_INFO.  */
 985
 986 static void
 987 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
 988                             innermost_loop_behavior *drb)
 989 {
 990   bool existed;
 991   std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
 992     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
 993   if (!existed || entry.second->base_alignment < drb->base_alignment)
 994     {
 995       entry = std::make_pair (stmt_info, drb);
 996       if (dump_enabled_p ())
 997         dump_printf_loc (MSG_NOTE, vect_location,
 998                          "recording new base alignment for %T\n"
 999                          "  alignment:    %d\n"
1000                          "  misalignment: %d\n"
1001                          "  based on:     %G",
1002                          drb->base_address,
1003                          drb->base_alignment,
1004                          drb->base_misalignment,
1005                          stmt_info->stmt);
1006     }
1007 }
1008
1009 /* If the region we're going to vectorize is reached, all unconditional
1010    data references occur at least once.  We can therefore pool the base
1011    alignment guarantees from each unconditional reference.  Do this by
1012    going through all the data references in VINFO and checking whether
1013    the containing statement makes the reference unconditionally.  If so,
1014    record the alignment of the base address in VINFO so that it can be
1015    used for all other references with the same base.  */
1016
1017 void
1018 vect_record_base_alignments (vec_info *vinfo)
1019 {
1020   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1021   class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
1022   for (data_reference *dr : vinfo->shared->datarefs)
1023     {
1024       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1025       stmt_vec_info stmt_info = dr_info->stmt;
1026       if (!DR_IS_CONDITIONAL_IN_STMT (dr)
1027           && STMT_VINFO_VECTORIZABLE (stmt_info)
1028           && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1029         {
1030           vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
1031
1032           /* If DR is nested in the loop that is being vectorized, we can also
1033              record the alignment of the base wrt the outer loop.  */
1034           if (loop && nested_in_vect_loop_p (loop, stmt_info))
1035             vect_record_base_alignment
1036               (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
1037         }
1038     }
1039 }
1040
1041 /* Function vect_compute_data_ref_alignment
1042
1043    Compute the misalignment of the data reference DR_INFO when vectorizing
1044    with VECTYPE.
1045
1046    Output:
1047    1. initialized misalignment info for DR_INFO
1048
1049    FOR NOW: No analysis is actually performed. Misalignment is calculated
1050    only for trivial cases. TODO.  */
1051
1052 static void
1053 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1054                                  tree vectype)
1055 {
1056   stmt_vec_info stmt_info = dr_info->stmt;
1057   vec_base_alignments *base_alignments = &vinfo->base_alignments;
1058   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1059   class loop *loop = NULL;
1060   tree ref = DR_REF (dr_info->dr);
1061
1062   if (dump_enabled_p ())
1063     dump_printf_loc (MSG_NOTE, vect_location,
1064                      "vect_compute_data_ref_alignment:\n");
1065
1066   if (loop_vinfo)
1067     loop = LOOP_VINFO_LOOP (loop_vinfo);
1068
1069   /* Initialize misalignment to unknown.  */
1070   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1071
1072   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1073     return;
1074
1075   innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1076   bool step_preserves_misalignment_p;
1077
1078   poly_uint64 vector_alignment
1079     = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1080                  BITS_PER_UNIT);
1081   SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1082
1083   /* If the main loop has peeled for alignment we have no way of knowing
1084      whether the data accesses in the epilogues are aligned.  We can't at
1085      compile time answer the question whether we have entered the main loop or
1086      not.  Fixes PR 92351.  */
1087   if (loop_vinfo)
1088     {
1089       loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1090       if (orig_loop_vinfo
1091           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1092         return;
1093     }
1094
1095   unsigned HOST_WIDE_INT vect_align_c;
1096   if (!vector_alignment.is_constant (&vect_align_c))
1097     return;
1098
1099   /* No step for BB vectorization.  */
1100   if (!loop)
1101     {
1102       gcc_assert (integer_zerop (drb->step));
1103       step_preserves_misalignment_p = true;
1104     }
1105
1106   /* In case the dataref is in an inner-loop of the loop that is being
1107      vectorized (LOOP), we use the base and misalignment information
1108      relative to the outer-loop (LOOP).  This is ok only if the misalignment
1109      stays the same throughout the execution of the inner-loop, which is why
1110      we have to check that the stride of the dataref in the inner-loop evenly
1111      divides by the vector alignment.  */
1112   else if (nested_in_vect_loop_p (loop, stmt_info))
1113     {
1114       step_preserves_misalignment_p
1115         = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1116
1117       if (dump_enabled_p ())
1118         {
1119           if (step_preserves_misalignment_p)
1120             dump_printf_loc (MSG_NOTE, vect_location,
1121                              "inner step divides the vector alignment.\n");
1122           else
1123             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1124                              "inner step doesn't divide the vector"
1125                              " alignment.\n");
1126         }
1127     }
1128
1129   /* Similarly we can only use base and misalignment information relative to
1130      an innermost loop if the misalignment stays the same throughout the
1131      execution of the loop.  As above, this is the case if the stride of
1132      the dataref evenly divides by the alignment.  */
1133   else
1134     {
1135       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1136       step_preserves_misalignment_p
1137         = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1138
1139       if (!step_preserves_misalignment_p && dump_enabled_p ())
1140         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1141                          "step doesn't divide the vector alignment.\n");
1142     }
1143
1144   unsigned int base_alignment = drb->base_alignment;
1145   unsigned int base_misalignment = drb->base_misalignment;
1146
1147   /* Calculate the maximum of the pooled base address alignment and the
1148      alignment that we can compute for DR itself.  */
1149   std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1150     = base_alignments->get (drb->base_address);
1151   if (entry
1152       && base_alignment < (*entry).second->base_alignment
1153       && (loop_vinfo
1154           || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1155                               gimple_bb (entry->first->stmt))
1156               && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1157                   || (entry->first->dr_aux.group <= dr_info->group)))))
1158     {
1159       base_alignment = entry->second->base_alignment;
1160       base_misalignment = entry->second->base_misalignment;
1161     }
1162
1163   if (drb->offset_alignment < vect_align_c
1164       || !step_preserves_misalignment_p
1165       /* We need to know whether the step wrt the vectorized loop is
1166          negative when computing the starting misalignment below.  */
1167       || TREE_CODE (drb->step) != INTEGER_CST)
1168     {
1169       if (dump_enabled_p ())
1170         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1171                          "Unknown alignment for access: %T\n", ref);
1172       return;
1173     }
1174
1175   if (base_alignment < vect_align_c)
1176     {
1177       unsigned int max_alignment;
1178       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1179       if (max_alignment < vect_align_c
1180           || !vect_can_force_dr_alignment_p (base,
1181                                              vect_align_c * BITS_PER_UNIT))
1182         {
1183           if (dump_enabled_p ())
1184             dump_printf_loc (MSG_NOTE, vect_location,
1185                              "can't force alignment of ref: %T\n", ref);
1186           return;
1187         }
1188
1189       /* Force the alignment of the decl.
1190          NOTE: This is the only change to the code we make during
1191          the analysis phase, before deciding to vectorize the loop.  */
1192       if (dump_enabled_p ())
1193         dump_printf_loc (MSG_NOTE, vect_location,
1194                          "force alignment of %T\n", ref);
1195
1196       dr_info->base_decl = base;
1197       dr_info->base_misaligned = true;
1198       base_misalignment = 0;
1199     }
1200   poly_int64 misalignment
1201     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1202
1203   unsigned int const_misalignment;
1204   if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1205     {
1206       if (dump_enabled_p ())
1207         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1208                          "Non-constant misalignment for access: %T\n", ref);
1209       return;
1210     }
1211
1212   SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1213
1214   if (dump_enabled_p ())
1215     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1216                      "misalign = %d bytes of ref %T\n",
1217                      const_misalignment, ref);
1218
1219   return;
1220 }
1221
1222 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1223    that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1224    is made aligned via peeling.  */
1225
1226 static bool
1227 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1228                                          dr_vec_info *dr_peel_info)
1229 {
1230   if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1231                   DR_TARGET_ALIGNMENT (dr_info)))
1232     {
1233       poly_offset_int diff
1234         = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1235            - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1236       if (known_eq (diff, 0)
1237           || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1238         return true;
1239     }
1240   return false;
1241 }
1242
1243 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1244    aligned via peeling.  */
1245
1246 static bool
1247 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1248                                  dr_vec_info *dr_peel_info)
1249 {
1250   if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1251                         DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1252       || !operand_equal_p (DR_OFFSET (dr_info->dr),
1253                            DR_OFFSET (dr_peel_info->dr), 0)
1254       || !operand_equal_p (DR_STEP (dr_info->dr),
1255                            DR_STEP (dr_peel_info->dr), 0))
1256     return false;
1257
1258   return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1259 }
1260
1261 /* Compute the value for dr_info->misalign so that the access appears
1262    aligned.  This is used by peeling to compensate for dr_misalignment
1263    applying the offset for negative step.  */
1264
1265 int
1266 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1267 {
1268   if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1269     return 0;
1270
1271   tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1272   poly_int64 misalignment
1273     = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1274        * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1275
1276   unsigned HOST_WIDE_INT target_alignment_c;
1277   int misalign;
1278   if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1279       || !known_misalignment (misalignment, target_alignment_c, &misalign))
1280     return DR_MISALIGNMENT_UNKNOWN;
1281   return misalign;
1282 }
1283
1284 /* Function vect_update_misalignment_for_peel.
1285    Sets DR_INFO's misalignment
1286    - to 0 if it has the same alignment as DR_PEEL_INFO,
1287    - to the misalignment computed using NPEEL if DR_INFO's salignment is known,
1288    - to -1 (unknown) otherwise.
1289
1290    DR_INFO - the data reference whose misalignment is to be adjusted.
1291    DR_PEEL_INFO - the data reference whose misalignment is being made
1292                   zero in the vector loop by the peel.
1293    NPEEL - the number of iterations in the peel loop if the misalignment
1294            of DR_PEEL_INFO is known at compile time.  */
1295
1296 static void
1297 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1298                                    dr_vec_info *dr_peel_info, int npeel)
1299 {
1300   /* If dr_info is aligned of dr_peel_info is, then mark it so.  */
1301   if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1302     {
1303       SET_DR_MISALIGNMENT (dr_info,
1304                            vect_dr_misalign_for_aligned_access (dr_peel_info));
1305       return;
1306     }
1307
1308   unsigned HOST_WIDE_INT alignment;
1309   if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1310       && known_alignment_for_access_p (dr_info,
1311                                        STMT_VINFO_VECTYPE (dr_info->stmt))
1312       && known_alignment_for_access_p (dr_peel_info,
1313                                        STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1314     {
1315       int misal = dr_info->misalignment;
1316       misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1317       misal &= alignment - 1;
1318       set_dr_misalignment (dr_info, misal);
1319       return;
1320     }
1321
1322   if (dump_enabled_p ())
1323     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1324                      "to unknown (-1).\n");
1325   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1326 }
1327
1328 /* Return true if alignment is relevant for DR_INFO.  */
1329
1330 static bool
1331 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1332 {
1333   stmt_vec_info stmt_info = dr_info->stmt;
1334
1335   if (!STMT_VINFO_RELEVANT_P (stmt_info))
1336     return false;
1337
1338   /* For interleaving, only the alignment of the first access matters.  */
1339   if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1340       && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1341     return false;
1342
1343   /* Scatter-gather and invariant accesses continue to address individual
1344      scalars, so vector-level alignment is irrelevant.  */
1345   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1346       || integer_zerop (DR_STEP (dr_info->dr)))
1347     return false;
1348
1349   /* Strided accesses perform only component accesses, alignment is
1350      irrelevant for them.  */
1351   if (STMT_VINFO_STRIDED_P (stmt_info)
1352       && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1353     return false;
1354
1355   return true;
1356 }
1357
1358 /* Given an memory reference EXP return whether its alignment is less
1359    than its size.  */
1360
1361 static bool
1362 not_size_aligned (tree exp)
1363 {
1364   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1365     return true;
1366
1367   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1368           > get_object_alignment (exp));
1369 }
1370
1371 /* Function vector_alignment_reachable_p
1372
1373    Return true if vector alignment for DR_INFO is reachable by peeling
1374    a few loop iterations.  Return false otherwise.  */
1375
1376 static bool
1377 vector_alignment_reachable_p (dr_vec_info *dr_info)
1378 {
1379   stmt_vec_info stmt_info = dr_info->stmt;
1380   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1381
1382   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1383     {
1384       /* For interleaved access we peel only if number of iterations in
1385          the prolog loop ({VF - misalignment}), is a multiple of the
1386          number of the interleaved accesses.  */
1387       int elem_size, mis_in_elements;
1388
1389       /* FORNOW: handle only known alignment.  */
1390       if (!known_alignment_for_access_p (dr_info, vectype))
1391         return false;
1392
1393       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1394       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1395       elem_size = vector_element_size (vector_size, nelements);
1396       mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1397
1398       if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1399         return false;
1400     }
1401
1402   /* If misalignment is known at the compile time then allow peeling
1403      only if natural alignment is reachable through peeling.  */
1404   if (known_alignment_for_access_p (dr_info, vectype)
1405       && !aligned_access_p (dr_info, vectype))
1406     {
1407       HOST_WIDE_INT elmsize =
1408                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1409       if (dump_enabled_p ())
1410         {
1411           dump_printf_loc (MSG_NOTE, vect_location,
1412                            "data size = %wd. misalignment = %d.\n", elmsize,
1413                            dr_misalignment (dr_info, vectype));
1414         }
1415       if (dr_misalignment (dr_info, vectype) % elmsize)
1416         {
1417           if (dump_enabled_p ())
1418             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1419                              "data size does not divide the misalignment.\n");
1420           return false;
1421         }
1422     }
1423
1424   if (!known_alignment_for_access_p (dr_info, vectype))
1425     {
1426       tree type = TREE_TYPE (DR_REF (dr_info->dr));
1427       bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1428       if (dump_enabled_p ())
1429         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1430                          "Unknown misalignment, %snaturally aligned\n",
1431                          is_packed ? "not " : "");
1432       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1433     }
1434
1435   return true;
1436 }
1437
1438
1439 /* Calculate the cost of the memory access represented by DR_INFO.  */
1440
1441 static void
1442 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1443                            dr_alignment_support alignment_support_scheme,
1444                            int misalignment,
1445                            unsigned int *inside_cost,
1446                            unsigned int *outside_cost,
1447                            stmt_vector_for_cost *body_cost_vec,
1448                            stmt_vector_for_cost *prologue_cost_vec)
1449 {
1450   stmt_vec_info stmt_info = dr_info->stmt;
1451   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1452   int ncopies;
1453
1454   if (PURE_SLP_STMT (stmt_info))
1455     ncopies = 1;
1456   else
1457     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1458
1459   if (DR_IS_READ (dr_info->dr))
1460     vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1461                         misalignment, true, inside_cost,
1462                         outside_cost, prologue_cost_vec, body_cost_vec, false);
1463   else
1464     vect_get_store_cost (vinfo,stmt_info, ncopies, alignment_support_scheme,
1465                          misalignment, inside_cost, body_cost_vec);
1466
1467   if (dump_enabled_p ())
1468     dump_printf_loc (MSG_NOTE, vect_location,
1469                      "vect_get_data_access_cost: inside_cost = %d, "
1470                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1471 }
1472
1473
/* An entry of the peeling hash table: one candidate number of peeled
   iterations.  Entries are hashed and compared by NPEEL only.  */
typedef struct _vect_peel_info
{
  /* Data reference that was inserted when this entry was created.  */
  dr_vec_info *dr_info;
  /* Number of iterations to peel; the hash table key.  */
  int npeel;
  /* Number of insertions with this NPEEL.  vect_peeling_hash_insert may
     additionally bias it by VECT_MAX_COST.  */
  unsigned int count;
} *vect_peel_info;
1480
/* A peeling candidate together with cost information.  */
typedef struct _vect_peel_extended_info
{
  vec_info *vinfo;
  /* The candidate itself; vect_peeling_hash_get_most_frequent overwrites
     it with the preferred hash table entry.  */
  struct _vect_peel_info peel_info;
  /* NOTE(review): presumably the costs inside and outside of the loop
     body for this candidate - confirm against the users.  */
  unsigned int inside_cost;
  unsigned int outside_cost;
} *vect_peel_extended_info;
1488
1489
1490 /* Peeling hashtable helpers.  */
1491
1492 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1493 {
1494   static inline hashval_t hash (const _vect_peel_info *);
1495   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1496 };
1497
1498 inline hashval_t
1499 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1500 {
1501   return (hashval_t) peel_info->npeel;
1502 }
1503
1504 inline bool
1505 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1506 {
1507   return (a->npeel == b->npeel);
1508 }
1509
1510
1511 /* Insert DR_INFO into peeling hash table with NPEEL as key.  */
1512
1513 static void
1514 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1515                           loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1516                           int npeel, bool supportable_if_not_aligned)
1517 {
1518   struct _vect_peel_info elem, *slot;
1519   _vect_peel_info **new_slot;
1520
1521   elem.npeel = npeel;
1522   slot = peeling_htab->find (&elem);
1523   if (slot)
1524     slot->count++;
1525   else
1526     {
1527       slot = XNEW (struct _vect_peel_info);
1528       slot->npeel = npeel;
1529       slot->dr_info = dr_info;
1530       slot->count = 1;
1531       new_slot = peeling_htab->find_slot (slot, INSERT);
1532       *new_slot = slot;
1533     }
1534
1535   /* If this DR is not supported with unknown misalignment then bias
1536      this slot when the cost model is disabled.  */
1537   if (!supportable_if_not_aligned
1538       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1539     slot->count += VECT_MAX_COST;
1540 }
1541
1542
1543 /* Traverse peeling hash table to find peeling option that aligns maximum
1544    number of data accesses.  */
1545
1546 int
1547 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1548                                      _vect_peel_extended_info *max)
1549 {
1550   vect_peel_info elem = *slot;
1551
1552   if (elem->count > max->peel_info.count
1553       || (elem->count == max->peel_info.count
1554           && max->peel_info.npeel > elem->npeel))
1555     {
1556       max->peel_info.npeel = elem->npeel;
1557       max->peel_info.count = elem->count;
1558       max->peel_info.dr_info = elem->dr_info;
1559     }
1560
1561   return 1;
1562 }
1563
1564 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1565    data access costs for all data refs.  If UNKNOWN_MISALIGNMENT is true,
1566    npeel is computed at runtime but DR0_INFO's misalignment will be zero
1567    after peeling.  */
1568
1569 static void
1570 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1571                                 dr_vec_info *dr0_info,
1572                                 unsigned int *inside_cost,
1573                                 unsigned int *outside_cost,
1574                                 stmt_vector_for_cost *body_cost_vec,
1575                                 stmt_vector_for_cost *prologue_cost_vec,
1576                                 unsigned int npeel)
1577 {
1578   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1579
1580   bool dr0_alignment_known_p
1581     = (dr0_info
1582        && known_alignment_for_access_p (dr0_info,
1583                                         STMT_VINFO_VECTYPE (dr0_info->stmt)));
1584
1585   for (data_reference *dr : datarefs)
1586     {
1587       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1588       if (!vect_relevant_for_alignment_p (dr_info))
1589         continue;
1590
1591       tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1592       dr_alignment_support alignment_support_scheme;
1593       int misalignment;
1594       unsigned HOST_WIDE_INT alignment;
1595
1596       bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1597                                             size_zero_node) < 0;
1598       poly_int64 off = 0;
1599       if (negative)
1600         off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1601                * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1602
1603       if (npeel == 0)
1604         misalignment = dr_misalignment (dr_info, vectype, off);
1605       else if (dr_info == dr0_info
1606                || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1607         misalignment = 0;
1608       else if (!dr0_alignment_known_p
1609                || !known_alignment_for_access_p (dr_info, vectype)
1610                || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1611         misalignment = DR_MISALIGNMENT_UNKNOWN;
1612       else
1613         {
1614           misalignment = dr_misalignment (dr_info, vectype, off);
1615           misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1616           misalignment &= alignment - 1;
1617         }
1618       alignment_support_scheme
1619         = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1620                                          misalignment);
1621
1622       vect_get_data_access_cost (loop_vinfo, dr_info,
1623                                  alignment_support_scheme, misalignment,
1624                                  inside_cost, outside_cost,
1625                                  body_cost_vec, prologue_cost_vec);
1626     }
1627 }
1628
1629 /* Traverse peeling hash table and calculate cost for each peeling option.
1630    Find the one with the lowest cost.  */
1631
1632 int
1633 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1634                                    _vect_peel_extended_info *min)
1635 {
1636   vect_peel_info elem = *slot;
1637   int dummy;
1638   unsigned int inside_cost = 0, outside_cost = 0;
1639   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1640   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1641                        epilogue_cost_vec;
1642
1643   prologue_cost_vec.create (2);
1644   body_cost_vec.create (2);
1645   epilogue_cost_vec.create (2);
1646
1647   vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1648                                   &outside_cost, &body_cost_vec,
1649                                   &prologue_cost_vec, elem->npeel);
1650
1651   body_cost_vec.release ();
1652
1653   outside_cost += vect_get_known_peeling_cost
1654     (loop_vinfo, elem->npeel, &dummy,
1655      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1656      &prologue_cost_vec, &epilogue_cost_vec);
1657
1658   /* Prologue and epilogue costs are added to the target model later.
1659      These costs depend only on the scalar iteration cost, the
1660      number of peeling iterations finally chosen, and the number of
1661      misaligned statements.  So discard the information found here.  */
1662   prologue_cost_vec.release ();
1663   epilogue_cost_vec.release ();
1664
1665   if (inside_cost < min->inside_cost
1666       || (inside_cost == min->inside_cost
1667           && outside_cost < min->outside_cost))
1668     {
1669       min->inside_cost = inside_cost;
1670       min->outside_cost = outside_cost;
1671       min->peel_info.dr_info = elem->dr_info;
1672       min->peel_info.npeel = elem->npeel;
1673       min->peel_info.count = elem->count;
1674     }
1675
1676   return 1;
1677 }
1678
1679
1680 /* Choose best peeling option by traversing peeling hash table and either
1681    choosing an option with the lowest cost (if cost model is enabled) or the
1682    option that aligns as many accesses as possible.  */
1683
1684 static struct _vect_peel_extended_info
1685 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1686                                        loop_vec_info loop_vinfo)
1687 {
1688    struct _vect_peel_extended_info res;
1689
1690    res.peel_info.dr_info = NULL;
1691    res.vinfo = loop_vinfo;
1692
1693    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1694      {
1695        res.inside_cost = INT_MAX;
1696        res.outside_cost = INT_MAX;
1697        peeling_htab->traverse <_vect_peel_extended_info *,
1698                                vect_peeling_hash_get_lowest_cost> (&res);
1699      }
1700    else
1701      {
1702        res.peel_info.count = 0;
1703        peeling_htab->traverse <_vect_peel_extended_info *,
1704                                vect_peeling_hash_get_most_frequent> (&res);
1705        res.inside_cost = 0;
1706        res.outside_cost = 0;
1707      }
1708
1709    return res;
1710 }
1711
1712 /* Return true if the new peeling NPEEL is supported.  */
1713
1714 static bool
1715 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1716                           unsigned npeel)
1717 {
1718   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1719   enum dr_alignment_support supportable_dr_alignment;
1720
1721   bool dr0_alignment_known_p
1722     = known_alignment_for_access_p (dr0_info,
1723                                     STMT_VINFO_VECTYPE (dr0_info->stmt));
1724
1725   /* Ensure that all data refs can be vectorized after the peel.  */
1726   for (data_reference *dr : datarefs)
1727     {
1728       if (dr == dr0_info->dr)
1729         continue;
1730
1731       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1732       if (!vect_relevant_for_alignment_p (dr_info)
1733           || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1734         continue;
1735
1736       tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1737       int misalignment;
1738       unsigned HOST_WIDE_INT alignment;
1739       if (!dr0_alignment_known_p
1740           || !known_alignment_for_access_p (dr_info, vectype)
1741           || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1742         misalignment = DR_MISALIGNMENT_UNKNOWN;
1743       else
1744         {
1745           misalignment = dr_misalignment (dr_info, vectype);
1746           misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1747           misalignment &= alignment - 1;
1748         }
1749       supportable_dr_alignment
1750         = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1751                                          misalignment);
1752       if (supportable_dr_alignment == dr_unaligned_unsupported)
1753         return false;
1754     }
1755
1756   return true;
1757 }
1758
1759 /* Compare two data-references DRA and DRB to group them into chunks
1760    with related alignment.  */
1761
1762 static int
1763 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1764 {
1765   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1766   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1767   int cmp;
1768
1769   /* Stabilize sort.  */
1770   if (dra == drb)
1771     return 0;
1772
1773   /* Ordering of DRs according to base.  */
1774   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
1775                                DR_BASE_ADDRESS (drb));
1776   if (cmp != 0)
1777     return cmp;
1778
1779   /* And according to DR_OFFSET.  */
1780   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
1781   if (cmp != 0)
1782     return cmp;
1783
1784   /* And after step.  */
1785   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
1786   if (cmp != 0)
1787     return cmp;
1788
1789   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
1790   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
1791   if (cmp == 0)
1792     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
1793   return cmp;
1794 }
1795
1796 /* Function vect_enhance_data_refs_alignment
1797
1798    This pass will use loop versioning and loop peeling in order to enhance
1799    the alignment of data references in the loop.
1800
1801    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1802    original loop is to be vectorized.  Any other loops that are created by
1803    the transformations performed in this pass - are not supposed to be
1804    vectorized.  This restriction will be relaxed.
1805
1806    This pass will require a cost model to guide it whether to apply peeling
1807    or versioning or a combination of the two.  For example, the scheme that
1808    intel uses when given a loop with several memory accesses, is as follows:
1809    choose one memory access ('p') whose alignment you want to force by doing
1810    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1811    other accesses are not necessarily aligned, or (2) use loop versioning to
1812    generate one loop in which all accesses are aligned, and another loop in
1813    which only 'p' is necessarily aligned.
1814
1815    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1816    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1817    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1818
1819    Devising a cost model is the most critical aspect of this work.  It will
1820    guide us on which access to peel for, whether to use loop versioning, how
1821    many versions to create, etc.  The cost model will probably consist of
1822    generic considerations as well as target specific considerations (on
1823    powerpc for example, misaligned stores are more painful than misaligned
1824    loads).
1825
1826    Here are the general steps involved in alignment enhancements:
1827
1828      -- original loop, before alignment analysis:
1829         for (i=0; i<N; i++){
1830           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1831           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1832         }
1833
1834      -- After vect_compute_data_refs_alignment:
1835         for (i=0; i<N; i++){
1836           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1837           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1838         }
1839
1840      -- Possibility 1: we do loop versioning:
1841      if (p is aligned) {
1842         for (i=0; i<N; i++){    # loop 1A
1843           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1844           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1845         }
1846      }
1847      else {
1848         for (i=0; i<N; i++){    # loop 1B
1849           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1850           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1851         }
1852      }
1853
1854      -- Possibility 2: we do loop peeling:
1855      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1856         x = q[i];
1857         p[i] = y;
1858      }
1859      for (i = 3; i < N; i++){   # loop 2A
1860         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1861         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1862      }
1863
1864      -- Possibility 3: combination of loop peeling and versioning:
1865      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1866         x = q[i];
1867         p[i] = y;
1868      }
1869      if (p is aligned) {
1870         for (i = 3; i<N; i++){  # loop 3A
1871           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1872           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1873         }
1874      }
1875      else {
1876         for (i = 3; i<N; i++){  # loop 3B
1877           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1878           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1879         }
1880      }
1881
1882      These loops are later passed to loop_transform to be vectorized.  The
1883      vectorizer will use the alignment information to guide the transformation
1884      (whether to generate regular loads/stores, or with special handling for
1885      misalignment).  */
1886
1887 opt_result
1888 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1889 {
1890   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1891   dr_vec_info *first_store = NULL;
1892   dr_vec_info *dr0_info = NULL;
1893   struct data_reference *dr;
1894   unsigned int i;
1895   bool do_peeling = false;
1896   bool do_versioning = false;
1897   unsigned int npeel = 0;
1898   bool one_misalignment_known = false;
1899   bool one_misalignment_unknown = false;
1900   bool one_dr_unsupportable = false;
1901   dr_vec_info *unsupportable_dr_info = NULL;
1902   unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
1903   hash_table<peel_info_hasher> peeling_htab (1);
1904
1905   DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1906
1907   /* Reset data so we can safely be called multiple times.  */
1908   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1909   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1910
1911   if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
1912     return opt_result::success ();
1913
1914   /* Sort the vector of datarefs so DRs that have the same or dependent
1915      alignment are next to each other.  */
1916   auto_vec<data_reference_p> datarefs
1917     = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
1918   datarefs.qsort (dr_align_group_sort_cmp);
1919
1920   /* Compute the number of DRs that become aligned when we peel
1921      a dataref so it becomes aligned.  */
1922   auto_vec<unsigned> n_same_align_refs (datarefs.length ());
1923   n_same_align_refs.quick_grow_cleared (datarefs.length ());
1924   unsigned i0;
1925   for (i0 = 0; i0 < datarefs.length (); ++i0)
1926     if (DR_BASE_ADDRESS (datarefs[i0]))
1927       break;
1928   for (i = i0 + 1; i <= datarefs.length (); ++i)
1929     {
1930       if (i == datarefs.length ()
1931           || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
1932                                DR_BASE_ADDRESS (datarefs[i]), 0)
1933           || !operand_equal_p (DR_OFFSET (datarefs[i0]),
1934                                DR_OFFSET (datarefs[i]), 0)
1935           || !operand_equal_p (DR_STEP (datarefs[i0]),
1936                                DR_STEP (datarefs[i]), 0))
1937         {
1938           /* The subgroup [i0, i-1] now only differs in DR_INIT and
1939              possibly DR_TARGET_ALIGNMENT.  Still the whole subgroup
1940              will get known misalignment if we align one of the refs
1941              with the largest DR_TARGET_ALIGNMENT.  */
1942           for (unsigned j = i0; j < i; ++j)
1943             {
1944               dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
1945               for (unsigned k = i0; k < i; ++k)
1946                 {
1947                   if (k == j)
1948                     continue;
1949                   dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
1950                   if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
1951                                                                dr_infoj))
1952                     n_same_align_refs[j]++;
1953                 }
1954             }
1955           i0 = i;
1956         }
1957     }
1958
1959   /* While cost model enhancements are expected in the future, the high level
1960      view of the code at this time is as follows:
1961
1962      A) If there is a misaligned access then see if peeling to align
1963         this access can make all data references satisfy
1964         vect_supportable_dr_alignment.  If so, update data structures
1965         as needed and return true.
1966
1967      B) If peeling wasn't possible and there is a data reference with an
1968         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1969         then see if loop versioning checks can be used to make all data
1970         references satisfy vect_supportable_dr_alignment.  If so, update
1971         data structures as needed and return true.
1972
1973      C) If neither peeling nor versioning were successful then return false if
1974         any data reference does not satisfy vect_supportable_dr_alignment.
1975
1976      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1977
1978      Note, Possibility 3 above (which is peeling and versioning together) is not
1979      being done at this time.  */
1980
1981   /* (1) Peeling to force alignment.  */
1982
1983   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1984      Considerations:
1985      + How many accesses will become aligned due to the peeling
1986      - How many accesses will become unaligned due to the peeling,
1987        and the cost of misaligned accesses.
1988      - The cost of peeling (the extra runtime checks, the increase
1989        in code size).  */
1990
1991   FOR_EACH_VEC_ELT (datarefs, i, dr)
1992     {
1993       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1994       if (!vect_relevant_for_alignment_p (dr_info))
1995         continue;
1996
1997       stmt_vec_info stmt_info = dr_info->stmt;
1998       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1999       do_peeling = vector_alignment_reachable_p (dr_info);
2000       if (do_peeling)
2001         {
2002           if (known_alignment_for_access_p (dr_info, vectype))
2003             {
2004               unsigned int npeel_tmp = 0;
2005               bool negative = tree_int_cst_compare (DR_STEP (dr),
2006                                                     size_zero_node) < 0;
2007
2008               /* If known_alignment_for_access_p then we have set
2009                  DR_MISALIGNMENT which is only done if we know it at compiler
2010                  time, so it is safe to assume target alignment is constant.
2011                */
2012               unsigned int target_align =
2013                 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
2014               unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
2015               poly_int64 off = 0;
2016               if (negative)
2017                 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
2018               unsigned int mis = dr_misalignment (dr_info, vectype, off);
2019               mis = negative ? mis : -mis;
2020               if (mis != 0)
2021                 npeel_tmp = (mis & (target_align - 1)) / dr_size;
2022
2023               /* For multiple types, it is possible that the bigger type access
2024                  will have more than one peeling option.  E.g., a loop with two
2025                  types: one of size (vector size / 4), and the other one of
2026                  size (vector size / 8).  Vectorization factor will 8.  If both
2027                  accesses are misaligned by 3, the first one needs one scalar
2028                  iteration to be aligned, and the second one needs 5.  But the
2029                  first one will be aligned also by peeling 5 scalar
2030                  iterations, and in that case both accesses will be aligned.
2031                  Hence, except for the immediate peeling amount, we also want
2032                  to try to add full vector size, while we don't exceed
2033                  vectorization factor.
2034                  We do this automatically for cost model, since we calculate
2035                  cost for every peeling option.  */
2036               poly_uint64 nscalars = npeel_tmp;
2037               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2038                 {
2039                   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2040                   nscalars = (STMT_SLP_TYPE (stmt_info)
2041                               ? vf * DR_GROUP_SIZE (stmt_info) : vf);
2042                 }
2043
2044               /* Save info about DR in the hash table.  Also include peeling
2045                  amounts according to the explanation above.  Indicate
2046                  the alignment status when the ref is not aligned.
2047                  ???  Rather than using unknown alignment here we should
2048                  prune all entries from the peeling hashtable which cause
2049                  DRs to be not supported.  */
2050               bool supportable_if_not_aligned
2051                 = vect_supportable_dr_alignment
2052                     (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2053               while (known_le (npeel_tmp, nscalars))
2054                 {
2055                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2056                                             dr_info, npeel_tmp,
2057                                             supportable_if_not_aligned);
2058                   npeel_tmp += MAX (1, target_align / dr_size);
2059                 }
2060
2061               one_misalignment_known = true;
2062             }
2063           else
2064             {
2065               /* If we don't know any misalignment values, we prefer
2066                  peeling for data-ref that has the maximum number of data-refs
2067                  with the same alignment, unless the target prefers to align
2068                  stores over load.  */
2069               unsigned same_align_drs = n_same_align_refs[i];
2070               if (!dr0_info
2071                   || dr0_same_align_drs < same_align_drs)
2072                 {
2073                   dr0_same_align_drs = same_align_drs;
2074                   dr0_info = dr_info;
2075                 }
2076               /* For data-refs with the same number of related
2077                  accesses prefer the one where the misalign
2078                  computation will be invariant in the outermost loop.  */
2079               else if (dr0_same_align_drs == same_align_drs)
2080                 {
2081                   class loop *ivloop0, *ivloop;
2082                   ivloop0 = outermost_invariant_loop_for_expr
2083                     (loop, DR_BASE_ADDRESS (dr0_info->dr));
2084                   ivloop = outermost_invariant_loop_for_expr
2085                     (loop, DR_BASE_ADDRESS (dr));
2086                   if ((ivloop && !ivloop0)
2087                       || (ivloop && ivloop0
2088                           && flow_loop_nested_p (ivloop, ivloop0)))
2089                     dr0_info = dr_info;
2090                 }
2091
2092               one_misalignment_unknown = true;
2093
2094               /* Check for data refs with unsupportable alignment that
2095                  can be peeled.  */
2096               enum dr_alignment_support supportable_dr_alignment
2097                 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2098                                                  DR_MISALIGNMENT_UNKNOWN);
2099               if (supportable_dr_alignment == dr_unaligned_unsupported)
2100                 {
2101                   one_dr_unsupportable = true;
2102                   unsupportable_dr_info = dr_info;
2103                 }
2104
2105               if (!first_store && DR_IS_WRITE (dr))
2106                 {
2107                   first_store = dr_info;
2108                   first_store_same_align_drs = same_align_drs;
2109                 }
2110             }
2111         }
2112       else
2113         {
2114           if (!aligned_access_p (dr_info, vectype))
2115             {
2116               if (dump_enabled_p ())
2117                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2118                                  "vector alignment may not be reachable\n");
2119               break;
2120             }
2121         }
2122     }
2123
2124   /* Check if we can possibly peel the loop.  */
2125   if (!vect_can_advance_ivs_p (loop_vinfo)
2126       || !slpeel_can_duplicate_loop_p (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
2127                                        LOOP_VINFO_IV_EXIT (loop_vinfo))
2128       || loop->inner)
2129     do_peeling = false;
2130
2131   struct _vect_peel_extended_info peel_for_known_alignment;
2132   struct _vect_peel_extended_info peel_for_unknown_alignment;
2133   struct _vect_peel_extended_info best_peel;
2134
2135   peel_for_unknown_alignment.inside_cost = INT_MAX;
2136   peel_for_unknown_alignment.outside_cost = INT_MAX;
2137   peel_for_unknown_alignment.peel_info.count = 0;
2138
2139   if (do_peeling
2140       && one_misalignment_unknown)
2141     {
2142       /* Check if the target requires to prefer stores over loads, i.e., if
2143          misaligned stores are more expensive than misaligned loads (taking
2144          drs with same alignment into account).  */
2145       unsigned int load_inside_cost = 0;
2146       unsigned int load_outside_cost = 0;
2147       unsigned int store_inside_cost = 0;
2148       unsigned int store_outside_cost = 0;
2149       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2150
2151       stmt_vector_for_cost dummy;
2152       dummy.create (2);
2153       vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2154                                       &load_inside_cost,
2155                                       &load_outside_cost,
2156                                       &dummy, &dummy, estimated_npeels);
2157       dummy.release ();
2158
2159       if (first_store)
2160         {
2161           dummy.create (2);
2162           vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2163                                           &store_inside_cost,
2164                                           &store_outside_cost,
2165                                           &dummy, &dummy,
2166                                           estimated_npeels);
2167           dummy.release ();
2168         }
2169       else
2170         {
2171           store_inside_cost = INT_MAX;
2172           store_outside_cost = INT_MAX;
2173         }
2174
2175       if (load_inside_cost > store_inside_cost
2176           || (load_inside_cost == store_inside_cost
2177               && load_outside_cost > store_outside_cost))
2178         {
2179           dr0_info = first_store;
2180           dr0_same_align_drs = first_store_same_align_drs;
2181           peel_for_unknown_alignment.inside_cost = store_inside_cost;
2182           peel_for_unknown_alignment.outside_cost = store_outside_cost;
2183         }
2184       else
2185         {
2186           peel_for_unknown_alignment.inside_cost = load_inside_cost;
2187           peel_for_unknown_alignment.outside_cost = load_outside_cost;
2188         }
2189
2190       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2191       prologue_cost_vec.create (2);
2192       epilogue_cost_vec.create (2);
2193
2194       int dummy2;
2195       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2196         (loop_vinfo, estimated_npeels, &dummy2,
2197          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2198          &prologue_cost_vec, &epilogue_cost_vec);
2199
2200       prologue_cost_vec.release ();
2201       epilogue_cost_vec.release ();
2202
2203       peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2204     }
2205
2206   peel_for_unknown_alignment.peel_info.npeel = 0;
2207   peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2208
2209   best_peel = peel_for_unknown_alignment;
2210
2211   peel_for_known_alignment.inside_cost = INT_MAX;
2212   peel_for_known_alignment.outside_cost = INT_MAX;
2213   peel_for_known_alignment.peel_info.count = 0;
2214   peel_for_known_alignment.peel_info.dr_info = NULL;
2215
2216   if (do_peeling && one_misalignment_known)
2217     {
2218       /* Peeling is possible, but there is no data access that is not supported
2219          unless aligned.  So we try to choose the best possible peeling from
2220          the hash table.  */
2221       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2222         (&peeling_htab, loop_vinfo);
2223     }
2224
2225   /* Compare costs of peeling for known and unknown alignment. */
2226   if (peel_for_known_alignment.peel_info.dr_info != NULL
2227       && peel_for_unknown_alignment.inside_cost
2228       >= peel_for_known_alignment.inside_cost)
2229     {
2230       best_peel = peel_for_known_alignment;
2231
2232       /* If the best peeling for known alignment has NPEEL == 0, perform no
2233          peeling at all except if there is an unsupportable dr that we can
2234          align.  */
2235       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2236         do_peeling = false;
2237     }
2238
2239   /* If there is an unsupportable data ref, prefer this over all choices so far
2240      since we'd have to discard a chosen peeling except when it accidentally
2241      aligned the unsupportable data ref.  */
2242   if (one_dr_unsupportable)
2243     dr0_info = unsupportable_dr_info;
2244   else if (do_peeling)
2245     {
2246       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2247          TODO: Use nopeel_outside_cost or get rid of it?  */
2248       unsigned nopeel_inside_cost = 0;
2249       unsigned nopeel_outside_cost = 0;
2250
2251       stmt_vector_for_cost dummy;
2252       dummy.create (2);
2253       vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2254                                       &nopeel_outside_cost, &dummy, &dummy, 0);
2255       dummy.release ();
2256
2257       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
2258          costs will be recorded.  */
2259       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2260       prologue_cost_vec.create (2);
2261       epilogue_cost_vec.create (2);
2262
2263       int dummy2;
2264       nopeel_outside_cost += vect_get_known_peeling_cost
2265         (loop_vinfo, 0, &dummy2,
2266          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2267          &prologue_cost_vec, &epilogue_cost_vec);
2268
2269       prologue_cost_vec.release ();
2270       epilogue_cost_vec.release ();
2271
2272       npeel = best_peel.peel_info.npeel;
2273       dr0_info = best_peel.peel_info.dr_info;
2274
2275       /* If no peeling is not more expensive than the best peeling we
2276          have so far, don't perform any peeling.  */
2277       if (nopeel_inside_cost <= best_peel.inside_cost)
2278         do_peeling = false;
2279     }
2280
2281   if (do_peeling)
2282     {
2283       stmt_vec_info stmt_info = dr0_info->stmt;
2284       if (known_alignment_for_access_p (dr0_info,
2285                                         STMT_VINFO_VECTYPE (stmt_info)))
2286         {
2287           bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2288                                                 size_zero_node) < 0;
2289           if (!npeel)
2290             {
2291               /* Since it's known at compile time, compute the number of
2292                  iterations in the peeled loop (the peeling factor) for use in
2293                  updating DR_MISALIGNMENT values.  The peeling factor is the
2294                  vectorization factor minus the misalignment as an element
2295                  count.  */
2296               tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2297               poly_int64 off = 0;
2298               if (negative)
2299                 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2300                        * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2301               unsigned int mis
2302                 = dr_misalignment (dr0_info, vectype, off);
2303               mis = negative ? mis : -mis;
2304               /* If known_alignment_for_access_p then we have set
2305                  DR_MISALIGNMENT which is only done if we know it at compiler
2306                  time, so it is safe to assume target alignment is constant.
2307                */
2308               unsigned int target_align =
2309                 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2310               npeel = ((mis & (target_align - 1))
2311                        / vect_get_scalar_dr_size (dr0_info));
2312             }
2313
2314           /* For interleaved data access every iteration accesses all the
2315              members of the group, therefore we divide the number of iterations
2316              by the group size.  */
2317           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2318             npeel /= DR_GROUP_SIZE (stmt_info);
2319
2320           if (dump_enabled_p ())
2321             dump_printf_loc (MSG_NOTE, vect_location,
2322                              "Try peeling by %d\n", npeel);
2323         }
2324
2325       /* Ensure that all datarefs can be vectorized after the peel.  */
2326       if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2327         do_peeling = false;
2328
2329       /* Check if all datarefs are supportable and log.  */
2330       if (do_peeling
2331           && npeel == 0
2332           && known_alignment_for_access_p (dr0_info,
2333                                            STMT_VINFO_VECTYPE (stmt_info)))
2334         return opt_result::success ();
2335
2336       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2337       if (do_peeling)
2338         {
2339           unsigned max_allowed_peel
2340             = param_vect_max_peeling_for_alignment;
2341           if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2342             max_allowed_peel = 0;
2343           if (max_allowed_peel != (unsigned)-1)
2344             {
2345               unsigned max_peel = npeel;
2346               if (max_peel == 0)
2347                 {
2348                   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2349                   unsigned HOST_WIDE_INT target_align_c;
2350                   if (target_align.is_constant (&target_align_c))
2351                     max_peel =
2352                       target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2353                   else
2354                     {
2355                       do_peeling = false;
2356                       if (dump_enabled_p ())
2357                         dump_printf_loc (MSG_NOTE, vect_location,
2358                           "Disable peeling, max peels set and vector"
2359                           " alignment unknown\n");
2360                     }
2361                 }
2362               if (max_peel > max_allowed_peel)
2363                 {
2364                   do_peeling = false;
2365                   if (dump_enabled_p ())
2366                     dump_printf_loc (MSG_NOTE, vect_location,
2367                         "Disable peeling, max peels reached: %d\n", max_peel);
2368                 }
2369             }
2370         }
2371
2372       /* Cost model #2 - if peeling may result in a remaining loop not
2373          iterating enough to be vectorized then do not peel.  Since this
2374          is a cost heuristic rather than a correctness decision, use the
2375          most likely runtime value for variable vectorization factors.  */
2376       if (do_peeling
2377           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2378         {
2379           unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2380           unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2381           if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2382               < assumed_vf + max_peel)
2383             do_peeling = false;
2384         }
2385
2386       if (do_peeling)
2387         {
2388           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2389              If the misalignment of DR_i is identical to that of dr0 then set
2390              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2391              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2392              by the peeling factor times the element size of DR_i (MOD the
2393              vectorization factor times the size).  Otherwise, the
2394              misalignment of DR_i must be set to unknown.  */
2395           FOR_EACH_VEC_ELT (datarefs, i, dr)
2396             if (dr != dr0_info->dr)
2397               {
2398                 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2399                 if (!vect_relevant_for_alignment_p (dr_info))
2400                   continue;
2401
2402                 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2403               }
2404
2405           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2406           if (npeel)
2407             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2408           else
2409             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2410           SET_DR_MISALIGNMENT (dr0_info,
2411                                vect_dr_misalign_for_aligned_access (dr0_info));
2412           if (dump_enabled_p ())
2413             {
2414               dump_printf_loc (MSG_NOTE, vect_location,
2415                                "Alignment of access forced using peeling.\n");
2416               dump_printf_loc (MSG_NOTE, vect_location,
2417                                "Peeling for alignment will be applied.\n");
2418             }
2419
2420           /* The inside-loop cost will be accounted for in vectorizable_load
2421              and vectorizable_store correctly with adjusted alignments.
2422              Drop the body_cst_vec on the floor here.  */
2423           return opt_result::success ();
2424         }
2425     }
2426
2427   /* (2) Versioning to force alignment.  */
2428
2429   /* Try versioning if:
2430      1) optimize loop for speed and the cost-model is not cheap
2431      2) there is at least one unsupported misaligned data ref with an unknown
2432         misalignment, and
2433      3) all misaligned data refs with a known misalignment are supported, and
2434      4) the number of runtime alignment checks is within reason.  */
2435
2436   do_versioning
2437     = (optimize_loop_nest_for_speed_p (loop)
2438        && !loop->inner /* FORNOW */
2439        && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2440
2441   if (do_versioning)
2442     {
2443       FOR_EACH_VEC_ELT (datarefs, i, dr)
2444         {
2445           dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2446           if (!vect_relevant_for_alignment_p (dr_info))
2447             continue;
2448
2449           stmt_vec_info stmt_info = dr_info->stmt;
2450           if (STMT_VINFO_STRIDED_P (stmt_info))
2451             {
2452               do_versioning = false;
2453               break;
2454             }
2455
2456           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2457           bool negative = tree_int_cst_compare (DR_STEP (dr),
2458                                                 size_zero_node) < 0;
2459           poly_int64 off = 0;
2460           if (negative)
2461             off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2462                    * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2463           int misalignment;
2464           if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2465             continue;
2466
2467           enum dr_alignment_support supportable_dr_alignment
2468             = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2469                                              misalignment);
2470           if (supportable_dr_alignment == dr_unaligned_unsupported)
2471             {
2472               if (misalignment != DR_MISALIGNMENT_UNKNOWN
2473                   || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2474                       >= (unsigned) param_vect_max_version_for_alignment_checks))
2475                 {
2476                   do_versioning = false;
2477                   break;
2478                 }
2479
2480               /* At present we don't support versioning for alignment
2481                  with variable VF, since there's no guarantee that the
2482                  VF is a power of two.  We could relax this if we added
2483                  a way of enforcing a power-of-two size.  */
2484               unsigned HOST_WIDE_INT size;
2485               if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2486                 {
2487                   do_versioning = false;
2488                   break;
2489                 }
2490
2491               /* Forcing alignment in the first iteration is no good if
2492                  we don't keep it across iterations.  For now, just disable
2493                  versioning in this case.
2494                  ?? We could actually unroll the loop to achieve the required
2495                  overall step alignment, and forcing the alignment could be
2496                  done by doing some iterations of the non-vectorized loop.  */
2497               if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2498                                * DR_STEP_ALIGNMENT (dr),
2499                                DR_TARGET_ALIGNMENT (dr_info)))
2500                 {
2501                   do_versioning = false;
2502                   break;
2503                 }
2504
2505               /* The rightmost bits of an aligned address must be zeros.
2506                  Construct the mask needed for this test.  For example,
2507                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2508                  mask must be 15 = 0xf. */
2509               int mask = size - 1;
2510
2511               /* FORNOW: use the same mask to test all potentially unaligned
2512                  references in the loop.  */
2513               if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2514                   && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2515                 {
2516                   do_versioning = false;
2517                   break;
2518                 }
2519
2520               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2521               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2522             }
2523         }
2524
2525       /* Versioning requires at least one misaligned data reference.  */
2526       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2527         do_versioning = false;
2528       else if (!do_versioning)
2529         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2530     }
2531
2532   if (do_versioning)
2533     {
2534       const vec<stmt_vec_info> &may_misalign_stmts
2535         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2536       stmt_vec_info stmt_info;
2537
2538       /* It can now be assumed that the data references in the statements
2539          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2540          of the loop being vectorized.  */
2541       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2542         {
2543           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2544           SET_DR_MISALIGNMENT (dr_info,
2545                                vect_dr_misalign_for_aligned_access (dr_info));
2546           if (dump_enabled_p ())
2547             dump_printf_loc (MSG_NOTE, vect_location,
2548                              "Alignment of access forced using versioning.\n");
2549         }
2550
2551       if (dump_enabled_p ())
2552         dump_printf_loc (MSG_NOTE, vect_location,
2553                          "Versioning for alignment will be applied.\n");
2554
2555       /* Peeling and versioning can't be done together at this time.  */
2556       gcc_assert (! (do_peeling && do_versioning));
2557
2558       return opt_result::success ();
2559     }
2560
2561   /* This point is reached if neither peeling nor versioning is being done.  */
2562   gcc_assert (! (do_peeling || do_versioning));
2563
2564   return opt_result::success ();
2565 }
2566
2567
2568 /* Function vect_analyze_data_refs_alignment
2569
2570    Analyze the alignment of the data-references in the loop.
2571    Return FALSE if a data reference is found that cannot be vectorized.  */
2572
2573 opt_result
2574 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2575 {
2576   DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2577
2578   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2579   struct data_reference *dr;
2580   unsigned int i;
2581
2582   vect_record_base_alignments (loop_vinfo);
2583   FOR_EACH_VEC_ELT (datarefs, i, dr)
2584     {
2585       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2586       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2587         {
2588           if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2589               && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2590             continue;
2591           vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2592                                            STMT_VINFO_VECTYPE (dr_info->stmt));
2593         }
2594     }
2595
2596   return opt_result::success ();
2597 }
2598
2599
2600 /* Analyze alignment of DRs of stmts in NODE.  */
2601
2602 static bool
2603 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2604 {
2605   /* Alignment is maintained in the first element of the group.  */
2606   stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2607   first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2608   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2609   tree vectype = SLP_TREE_VECTYPE (node);
2610   poly_uint64 vector_alignment
2611     = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2612                  BITS_PER_UNIT);
2613   if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2614     vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2615   /* Re-analyze alignment when we're facing a vectorization with a bigger
2616      alignment requirement.  */
2617   else if (known_lt (dr_info->target_alignment, vector_alignment))
2618     {
2619       poly_uint64 old_target_alignment = dr_info->target_alignment;
2620       int old_misalignment = dr_info->misalignment;
2621       vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2622       /* But keep knowledge about a smaller alignment.  */
2623       if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2624           && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2625         {
2626           dr_info->target_alignment = old_target_alignment;
2627           dr_info->misalignment = old_misalignment;
2628         }
2629     }
2630   /* When we ever face unordered target alignments the first one wins in terms
2631      of analyzing and the other will become unknown in dr_misalignment.  */
2632   return true;
2633 }
2634
2635 /* Function vect_slp_analyze_instance_alignment
2636
2637    Analyze the alignment of the data-references in the SLP instance.
2638    Return FALSE if a data reference is found that cannot be vectorized.  */
2639
2640 bool
2641 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2642                                                 slp_instance instance)
2643 {
2644   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2645
2646   slp_tree node;
2647   unsigned i;
2648   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2649     if (! vect_slp_analyze_node_alignment (vinfo, node))
2650       return false;
2651
2652   if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2653       && ! vect_slp_analyze_node_alignment
2654              (vinfo, SLP_INSTANCE_TREE (instance)))
2655     return false;
2656
2657   return true;
2658 }
2659
2660
2661 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2662    accesses of legal size, step, etc.  Detect gaps, single element
2663    interleaving, and other special cases. Set grouped access info.
2664    Collect groups of strided stores for further use in SLP analysis.
2665    Worker for vect_analyze_group_access.  */
2666
2667 static bool
2668 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2669 {
2670   data_reference *dr = dr_info->dr;
2671   tree step = DR_STEP (dr);
2672   tree scalar_type = TREE_TYPE (DR_REF (dr));
2673   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2674   stmt_vec_info stmt_info = dr_info->stmt;
2675   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2676   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2677   HOST_WIDE_INT dr_step = -1;
2678   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2679   bool slp_impossible = false;
2680
2681   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2682      size of the interleaving group (including gaps).  */
2683   if (tree_fits_shwi_p (step))
2684     {
2685       dr_step = tree_to_shwi (step);
2686       /* Check that STEP is a multiple of type size.  Otherwise there is
2687          a non-element-sized gap at the end of the group which we
2688          cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2689          ???  As we can handle non-constant step fine here we should
2690          simply remove uses of DR_GROUP_GAP between the last and first
2691          element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
2692          simply not include that gap.  */
2693       if ((dr_step % type_size) != 0)
2694         {
2695           if (dump_enabled_p ())
2696             dump_printf_loc (MSG_NOTE, vect_location,
2697                              "Step %T is not a multiple of the element size"
2698                              " for %T\n",
2699                              step, DR_REF (dr));
2700           return false;
2701         }
2702       groupsize = absu_hwi (dr_step) / type_size;
2703     }
2704   else
2705     groupsize = 0;
2706
2707   /* Not consecutive access is possible only if it is a part of interleaving.  */
2708   if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2709     {
2710       /* Check if it this DR is a part of interleaving, and is a single
2711          element of the group that is accessed in the loop.  */
2712
2713       /* Gaps are supported only for loads. STEP must be a multiple of the type
2714          size.  */
2715       if (DR_IS_READ (dr)
2716           && (dr_step % type_size) == 0
2717           && groupsize > 0
2718           /* This could be UINT_MAX but as we are generating code in a very
2719              inefficient way we have to cap earlier.
2720              See PR91403 for example.  */
2721           && groupsize <= 4096)
2722         {
2723           DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2724           DR_GROUP_SIZE (stmt_info) = groupsize;
2725           DR_GROUP_GAP (stmt_info) = groupsize - 1;
2726           if (dump_enabled_p ())
2727             dump_printf_loc (MSG_NOTE, vect_location,
2728                              "Detected single element interleaving %T"
2729                              " step %T\n",
2730                              DR_REF (dr), step);
2731
2732           return true;
2733         }
2734
2735       if (dump_enabled_p ())
2736         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2737                          "not consecutive access %G", stmt_info->stmt);
2738
2739       if (bb_vinfo)
2740         {
2741           /* Mark the statement as unvectorizable.  */
2742           STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2743           return true;
2744         }
2745
2746       if (dump_enabled_p ())
2747         dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2748       STMT_VINFO_STRIDED_P (stmt_info) = true;
2749       return true;
2750     }
2751
2752   if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2753     {
2754       /* First stmt in the interleaving chain. Check the chain.  */
2755       stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2756       struct data_reference *data_ref = dr;
2757       unsigned int count = 1;
2758       tree prev_init = DR_INIT (data_ref);
2759       HOST_WIDE_INT diff, gaps = 0;
2760
2761       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2762       while (next)
2763         {
2764           /* We never have the same DR multiple times.  */
2765           gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2766                                 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2767
2768           data_ref = STMT_VINFO_DATA_REF (next);
2769
2770           /* All group members have the same STEP by construction.  */
2771           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2772
2773           /* Check that the distance between two accesses is equal to the type
2774              size. Otherwise, we have gaps.  */
2775           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2776                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2777           if (diff < 1 || diff > UINT_MAX)
2778             {
2779               /* For artificial testcases with array accesses with large
2780                  constant indices we can run into overflow issues which
2781                  can end up fooling the groupsize constraint below so
2782                  check the individual gaps (which are represented as
2783                  unsigned int) as well.  */
2784               if (dump_enabled_p ())
2785                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2786                                  "interleaved access with gap larger "
2787                                  "than representable\n");
2788               return false;
2789             }
2790           if (diff != 1)
2791             {
2792               /* FORNOW: SLP of accesses with gaps is not supported.  */
2793               slp_impossible = true;
2794               if (DR_IS_WRITE (data_ref))
2795                 {
2796                   if (dump_enabled_p ())
2797                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2798                                      "interleaved store with gaps\n");
2799                   return false;
2800                 }
2801
2802               gaps += diff - 1;
2803             }
2804
2805           last_accessed_element += diff;
2806
2807           /* Store the gap from the previous member of the group. If there is no
2808              gap in the access, DR_GROUP_GAP is always 1.  */
2809           DR_GROUP_GAP (next) = diff;
2810
2811           prev_init = DR_INIT (data_ref);
2812           next = DR_GROUP_NEXT_ELEMENT (next);
2813           /* Count the number of data-refs in the chain.  */
2814           count++;
2815         }
2816
2817       if (groupsize == 0)
2818         groupsize = count + gaps;
2819
2820       /* This could be UINT_MAX but as we are generating code in a very
2821          inefficient way we have to cap earlier.  See PR78699 for example.  */
2822       if (groupsize > 4096)
2823         {
2824           if (dump_enabled_p ())
2825             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2826                              "group is too large\n");
2827           return false;
2828         }
2829
2830       /* Check that the size of the interleaving is equal to count for stores,
2831          i.e., that there are no gaps.  */
2832       if (groupsize != count
2833           && !DR_IS_READ (dr))
2834         {
2835           groupsize = count;
2836           STMT_VINFO_STRIDED_P (stmt_info) = true;
2837         }
2838
2839       /* If there is a gap after the last load in the group it is the
2840          difference between the groupsize and the last accessed
2841          element.
2842          When there is no gap, this difference should be 0.  */
2843       DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2844
2845       DR_GROUP_SIZE (stmt_info) = groupsize;
2846       if (dump_enabled_p ())
2847         {
2848           dump_printf_loc (MSG_NOTE, vect_location,
2849                            "Detected interleaving ");
2850           if (DR_IS_READ (dr))
2851             dump_printf (MSG_NOTE, "load ");
2852           else if (STMT_VINFO_STRIDED_P (stmt_info))
2853             dump_printf (MSG_NOTE, "strided store ");
2854           else
2855             dump_printf (MSG_NOTE, "store ");
2856           dump_printf (MSG_NOTE, "of size %u\n",
2857                        (unsigned)groupsize);
2858           dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2859           next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2860           while (next)
2861             {
2862               if (DR_GROUP_GAP (next) != 1)
2863                 dump_printf_loc (MSG_NOTE, vect_location,
2864                                  "\t<gap of %d elements>\n",
2865                                  DR_GROUP_GAP (next) - 1);
2866               dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2867               next = DR_GROUP_NEXT_ELEMENT (next);
2868             }
2869           if (DR_GROUP_GAP (stmt_info) != 0)
2870             dump_printf_loc (MSG_NOTE, vect_location,
2871                              "\t<gap of %d elements>\n",
2872                              DR_GROUP_GAP (stmt_info));
2873         }
2874
2875       /* SLP: create an SLP data structure for every interleaving group of
2876          stores for further analysis in vect_analyse_slp.  */
2877       if (DR_IS_WRITE (dr) && !slp_impossible)
2878         {
2879           if (loop_vinfo)
2880             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2881           if (bb_vinfo)
2882             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2883         }
2884     }
2885
2886   return true;
2887 }
2888
2889 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2890    accesses of legal size, step, etc.  Detect gaps, single element
2891    interleaving, and other special cases. Set grouped access info.
2892    Collect groups of strided stores for further use in SLP analysis.  */
2893
2894 static bool
2895 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
2896 {
2897   if (!vect_analyze_group_access_1 (vinfo, dr_info))
2898     {
2899       /* Dissolve the group if present.  */
2900       stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2901       while (stmt_info)
2902         {
2903           stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2904           DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2905           DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2906           stmt_info = next;
2907         }
2908       return false;
2909     }
2910   return true;
2911 }
2912
2913 /* Analyze the access pattern of the data-reference DR_INFO.
2914    In case of non-consecutive accesses call vect_analyze_group_access() to
2915    analyze groups of accesses.  */
2916
2917 static bool
2918 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
2919 {
2920   data_reference *dr = dr_info->dr;
2921   tree step = DR_STEP (dr);
2922   tree scalar_type = TREE_TYPE (DR_REF (dr));
2923   stmt_vec_info stmt_info = dr_info->stmt;
2924   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2925   class loop *loop = NULL;
2926
2927   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2928     return true;
2929
2930   if (loop_vinfo)
2931     loop = LOOP_VINFO_LOOP (loop_vinfo);
2932
2933   if (loop_vinfo && !step)
2934     {
2935       if (dump_enabled_p ())
2936         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2937                          "bad data-ref access in loop\n");
2938       return false;
2939     }
2940
2941   /* Allow loads with zero step in inner-loop vectorization.  */
2942   if (loop_vinfo && integer_zerop (step))
2943     {
2944       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2945       if (!nested_in_vect_loop_p (loop, stmt_info))
2946         return DR_IS_READ (dr);
2947       /* Allow references with zero step for outer loops marked
2948          with pragma omp simd only - it guarantees absence of
2949          loop-carried dependencies between inner loop iterations.  */
2950       if (loop->safelen < 2)
2951         {
2952           if (dump_enabled_p ())
2953             dump_printf_loc (MSG_NOTE, vect_location,
2954                              "zero step in inner loop of nest\n");
2955           return false;
2956         }
2957     }
2958
2959   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2960     {
2961       /* Interleaved accesses are not yet supported within outer-loop
2962         vectorization for references in the inner-loop.  */
2963       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2964
2965       /* For the rest of the analysis we use the outer-loop step.  */
2966       step = STMT_VINFO_DR_STEP (stmt_info);
2967       if (integer_zerop (step))
2968         {
2969           if (dump_enabled_p ())
2970             dump_printf_loc (MSG_NOTE, vect_location,
2971                              "zero step in outer loop.\n");
2972           return DR_IS_READ (dr);
2973         }
2974     }
2975
2976   /* Consecutive?  */
2977   if (TREE_CODE (step) == INTEGER_CST)
2978     {
2979       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2980       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2981           || (dr_step < 0
2982               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2983         {
2984           /* Mark that it is not interleaving.  */
2985           DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2986           return true;
2987         }
2988     }
2989
2990   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2991     {
2992       if (dump_enabled_p ())
2993         dump_printf_loc (MSG_NOTE, vect_location,
2994                          "grouped access in outer loop.\n");
2995       return false;
2996     }
2997
2998
2999   /* Assume this is a DR handled by non-constant strided load case.  */
3000   if (TREE_CODE (step) != INTEGER_CST)
3001     return (STMT_VINFO_STRIDED_P (stmt_info)
3002             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3003                 || vect_analyze_group_access (vinfo, dr_info)));
3004
3005   /* Not consecutive access - check if it's a part of interleaving group.  */
3006   return vect_analyze_group_access (vinfo, dr_info);
3007 }
3008
3009 /* Compare two data-references DRA and DRB to group them into chunks
3010    suitable for grouping.  */
3011
3012 static int
3013 dr_group_sort_cmp (const void *dra_, const void *drb_)
3014 {
3015   dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
3016   dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
3017   data_reference_p dra = dra_info->dr;
3018   data_reference_p drb = drb_info->dr;
3019   int cmp;
3020
3021   /* Stabilize sort.  */
3022   if (dra == drb)
3023     return 0;
3024
3025   /* Different group IDs lead never belong to the same group.  */
3026   if (dra_info->group != drb_info->group)
3027     return dra_info->group < drb_info->group ? -1 : 1;
3028
3029   /* Ordering of DRs according to base.  */
3030   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3031                                DR_BASE_ADDRESS (drb));
3032   if (cmp != 0)
3033     return cmp;
3034
3035   /* And according to DR_OFFSET.  */
3036   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
3037   if (cmp != 0)
3038     return cmp;
3039
3040   /* Put reads before writes.  */
3041   if (DR_IS_READ (dra) != DR_IS_READ (drb))
3042     return DR_IS_READ (dra) ? -1 : 1;
3043
3044   /* Then sort after access size.  */
3045   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
3046                                TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
3047   if (cmp != 0)
3048     return cmp;
3049
3050   /* And after step.  */
3051   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3052   if (cmp != 0)
3053     return cmp;
3054
3055   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
3056   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3057   if (cmp == 0)
3058     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3059   return cmp;
3060 }
3061
3062 /* If OP is the result of a conversion, return the unconverted value,
3063    otherwise return null.  */
3064
3065 static tree
3066 strip_conversion (tree op)
3067 {
3068   if (TREE_CODE (op) != SSA_NAME)
3069     return NULL_TREE;
3070   gimple *stmt = SSA_NAME_DEF_STMT (op);
3071   if (!is_gimple_assign (stmt)
3072       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3073     return NULL_TREE;
3074   return gimple_assign_rhs1 (stmt);
3075 }
3076
3077 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3078    and STMT2_INFO being in a single group.  When ALLOW_SLP_P, masked loads can
3079    be grouped in SLP mode.  */
3080
3081 static bool
3082 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3083                    bool allow_slp_p)
3084 {
3085   if (gimple_assign_single_p (stmt1_info->stmt))
3086     return gimple_assign_single_p (stmt2_info->stmt);
3087
3088   gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3089   if (call1 && gimple_call_internal_p (call1))
3090     {
3091       /* Check for two masked loads or two masked stores.  */
3092       gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3093       if (!call2 || !gimple_call_internal_p (call2))
3094         return false;
3095       internal_fn ifn = gimple_call_internal_fn (call1);
3096       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3097         return false;
3098       if (ifn != gimple_call_internal_fn (call2))
3099         return false;
3100
3101       /* Check that the masks are the same.  Cope with casts of masks,
3102          like those created by build_mask_conversion.  */
3103       tree mask1 = gimple_call_arg (call1, 2);
3104       tree mask2 = gimple_call_arg (call2, 2);
3105       if (!operand_equal_p (mask1, mask2, 0) && !allow_slp_p)
3106         {
3107           mask1 = strip_conversion (mask1);
3108           if (!mask1)
3109             return false;
3110           mask2 = strip_conversion (mask2);
3111           if (!mask2)
3112             return false;
3113           if (!operand_equal_p (mask1, mask2, 0))
3114             return false;
3115         }
3116       return true;
3117     }
3118
3119   return false;
3120 }
3121
3122 /* Function vect_analyze_data_ref_accesses.
3123
3124    Analyze the access pattern of all the data references in the loop.
3125
3126    FORNOW: the only access pattern that is considered vectorizable is a
3127            simple step 1 (consecutive) access.
3128
3129    FORNOW: handle only arrays and pointer accesses.  */
3130
3131 opt_result
3132 vect_analyze_data_ref_accesses (vec_info *vinfo,
3133                                 vec<int> *dataref_groups)
3134 {
3135   unsigned int i;
3136   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3137
3138   DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3139
3140   if (datarefs.is_empty ())
3141     return opt_result::success ();
3142
3143   /* Sort the array of datarefs to make building the interleaving chains
3144      linear.  Don't modify the original vector's order, it is needed for
3145      determining what dependencies are reversed.  */
3146   vec<dr_vec_info *> datarefs_copy;
3147   datarefs_copy.create (datarefs.length ());
3148   for (unsigned i = 0; i < datarefs.length (); i++)
3149     {
3150       dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3151       /* If the caller computed DR grouping use that, otherwise group by
3152          basic blocks.  */
3153       if (dataref_groups)
3154         dr_info->group = (*dataref_groups)[i];
3155       else
3156         dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3157       datarefs_copy.quick_push (dr_info);
3158     }
3159   datarefs_copy.qsort (dr_group_sort_cmp);
3160   hash_set<stmt_vec_info> to_fixup;
3161
3162   /* Build the interleaving chains.  */
3163   for (i = 0; i < datarefs_copy.length () - 1;)
3164     {
3165       dr_vec_info *dr_info_a = datarefs_copy[i];
3166       data_reference_p dra = dr_info_a->dr;
3167       int dra_group_id = dr_info_a->group;
3168       stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3169       stmt_vec_info lastinfo = NULL;
3170       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3171           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3172         {
3173           ++i;
3174           continue;
3175         }
3176       for (i = i + 1; i < datarefs_copy.length (); ++i)
3177         {
3178           dr_vec_info *dr_info_b = datarefs_copy[i];
3179           data_reference_p drb = dr_info_b->dr;
3180           int drb_group_id = dr_info_b->group;
3181           stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3182           if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3183               || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3184             break;
3185
3186           /* ???  Imperfect sorting (non-compatible types, non-modulo
3187              accesses, same accesses) can lead to a group to be artificially
3188              split here as we don't just skip over those.  If it really
3189              matters we can push those to a worklist and re-iterate
3190              over them.  The we can just skip ahead to the next DR here.  */
3191
3192           /* DRs in a different DR group should not be put into the same
3193              interleaving group.  */
3194           if (dra_group_id != drb_group_id)
3195             break;
3196
3197           /* Check that the data-refs have same first location (except init)
3198              and they are both either store or load (not load and store,
3199              not masked loads or stores).  */
3200           if (DR_IS_READ (dra) != DR_IS_READ (drb)
3201               || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3202                                         DR_BASE_ADDRESS (drb)) != 0
3203               || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3204               || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3205             break;
3206
3207           /* Check that the data-refs have the same constant size.  */
3208           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3209           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3210           if (!tree_fits_uhwi_p (sza)
3211               || !tree_fits_uhwi_p (szb)
3212               || !tree_int_cst_equal (sza, szb))
3213             break;
3214
3215           /* Check that the data-refs have the same step.  */
3216           if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3217             break;
3218
3219           /* Check the types are compatible.
3220              ???  We don't distinguish this during sorting.  */
3221           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3222                                    TREE_TYPE (DR_REF (drb))))
3223             break;
3224
3225           /* Check that the DR_INITs are compile-time constants.  */
3226           if (!tree_fits_shwi_p (DR_INIT (dra))
3227               || !tree_fits_shwi_p (DR_INIT (drb)))
3228             break;
3229
3230           /* Different .GOMP_SIMD_LANE calls still give the same lane,
3231              just hold extra information.  */
3232           if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3233               && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3234               && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3235             break;
3236
3237           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
3238           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3239           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3240           HOST_WIDE_INT init_prev
3241             = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3242           gcc_assert (init_a <= init_b
3243                       && init_a <= init_prev
3244                       && init_prev <= init_b);
3245
3246           /* Do not place the same access in the interleaving chain twice.  */
3247           if (init_b == init_prev)
3248             {
3249               gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3250                           < gimple_uid (DR_STMT (drb)));
3251               /* Simply link in duplicates and fix up the chain below.  */
3252             }
3253           else
3254             {
3255               /* If init_b == init_a + the size of the type * k, we have an
3256                  interleaving, and DRA is accessed before DRB.  */
3257               unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3258               if (type_size_a == 0
3259                   || (((unsigned HOST_WIDE_INT)init_b - init_a)
3260                       % type_size_a != 0))
3261                 break;
3262
3263               /* If we have a store, the accesses are adjacent.  This splits
3264                  groups into chunks we support (we don't support vectorization
3265                  of stores with gaps).  */
3266               if (!DR_IS_READ (dra)
3267                   && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3268                       != type_size_a))
3269                 break;
3270
3271               /* If the step (if not zero or non-constant) is smaller than the
3272                  difference between data-refs' inits this splits groups into
3273                  suitable sizes.  */
3274               if (tree_fits_shwi_p (DR_STEP (dra)))
3275                 {
3276                   unsigned HOST_WIDE_INT step
3277                     = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3278                   if (step != 0
3279                       && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3280                     break;
3281                 }
3282             }
3283
3284           if (dump_enabled_p ())
3285             dump_printf_loc (MSG_NOTE, vect_location,
3286                              DR_IS_READ (dra)
3287                              ? "Detected interleaving load %T and %T\n"
3288                              : "Detected interleaving store %T and %T\n",
3289                              DR_REF (dra), DR_REF (drb));
3290
3291           /* Link the found element into the group list.  */
3292           if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3293             {
3294               DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3295               lastinfo = stmtinfo_a;
3296             }
3297           DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3298           DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3299           lastinfo = stmtinfo_b;
3300
3301           STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3302             = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3303
3304           if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3305             dump_printf_loc (MSG_NOTE, vect_location,
3306                              "Load suitable for SLP vectorization only.\n");
3307
3308           if (init_b == init_prev
3309               && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3310               && dump_enabled_p ())
3311             dump_printf_loc (MSG_NOTE, vect_location,
3312                              "Queuing group with duplicate access for fixup\n");
3313         }
3314     }
3315
3316   /* Fixup groups with duplicate entries by splitting it.  */
3317   while (1)
3318     {
3319       hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3320       if (!(it != to_fixup.end ()))
3321         break;
3322       stmt_vec_info grp = *it;
3323       to_fixup.remove (grp);
3324
3325       /* Find the earliest duplicate group member.  */
3326       unsigned first_duplicate = -1u;
3327       stmt_vec_info next, g = grp;
3328       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3329         {
3330           if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3331                                   DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3332               && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3333             first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3334           g = next;
3335         }
3336       if (first_duplicate == -1U)
3337         continue;
3338
3339       /* Then move all stmts after the first duplicate to a new group.
3340          Note this is a heuristic but one with the property that *it
3341          is fixed up completely.  */
3342       g = grp;
3343       stmt_vec_info newgroup = NULL, ng = grp;
3344       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3345         {
3346           if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3347             {
3348               DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3349               if (!newgroup)
3350                 newgroup = next;
3351               else
3352                 DR_GROUP_NEXT_ELEMENT (ng) = next;
3353               ng = next;
3354               DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3355             }
3356           else
3357             g = DR_GROUP_NEXT_ELEMENT (g);
3358         }
3359       DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3360
3361       /* Fixup the new group which still may contain duplicates.  */
3362       to_fixup.add (newgroup);
3363     }
3364
3365   dr_vec_info *dr_info;
3366   FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3367     {
3368       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3369           && !vect_analyze_data_ref_access (vinfo, dr_info))
3370         {
3371           if (dump_enabled_p ())
3372             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3373                              "not vectorized: complicated access pattern.\n");
3374
3375           if (is_a <bb_vec_info> (vinfo))
3376             {
3377               /* Mark the statement as not vectorizable.  */
3378               STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3379               continue;
3380             }
3381           else
3382             {
3383               datarefs_copy.release ();
3384               return opt_result::failure_at (dr_info->stmt->stmt,
3385                                              "not vectorized:"
3386                                              " complicated access pattern.\n");
3387             }
3388         }
3389     }
3390
3391   datarefs_copy.release ();
3392   return opt_result::success ();
3393 }
3394
3395 /* Function vect_vfa_segment_size.
3396
3397    Input:
3398      DR_INFO: The data reference.
3399      LENGTH_FACTOR: segment length to consider.
3400
3401    Return a value suitable for the dr_with_seg_len::seg_len field.
3402    This is the "distance travelled" by the pointer from the first
3403    iteration in the segment to the last.  Note that it does not include
3404    the size of the access; in effect it only describes the first byte.  */
3405
3406 static tree
3407 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3408 {
3409   length_factor = size_binop (MINUS_EXPR,
3410                               fold_convert (sizetype, length_factor),
3411                               size_one_node);
3412   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3413                      length_factor);
3414 }
3415
3416 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3417    gives the worst-case number of bytes covered by the segment.  */
3418
3419 static unsigned HOST_WIDE_INT
3420 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3421 {
3422   stmt_vec_info stmt_vinfo = dr_info->stmt;
3423   tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3424   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3425   unsigned HOST_WIDE_INT access_size = ref_size;
3426   if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3427     {
3428       gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3429       access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3430     }
3431   tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3432   int misalignment;
3433   if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3434       && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3435       && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3436           == dr_explicit_realign_optimized))
3437     {
3438       /* We might access a full vector's worth.  */
3439       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3440     }
3441   return access_size;
3442 }
3443
3444 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3445    describes.  */
3446
3447 static unsigned int
3448 vect_vfa_align (dr_vec_info *dr_info)
3449 {
3450   return dr_alignment (dr_info->dr);
3451 }
3452
3453 /* Function vect_no_alias_p.
3454
3455    Given data references A and B with equal base and offset, see whether
3456    the alias relation can be decided at compilation time.  Return 1 if
3457    it can and the references alias, 0 if it can and the references do
3458    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3459    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3460    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
3461
3462 static int
3463 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3464                          tree segment_length_a, tree segment_length_b,
3465                          unsigned HOST_WIDE_INT access_size_a,
3466                          unsigned HOST_WIDE_INT access_size_b)
3467 {
3468   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3469   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3470   poly_uint64 const_length_a;
3471   poly_uint64 const_length_b;
3472
3473   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3474      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3475      [a, a+12) */
3476   if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3477     {
3478       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3479       offset_a -= const_length_a;
3480     }
3481   else
3482     const_length_a = tree_to_poly_uint64 (segment_length_a);
3483   if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3484     {
3485       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3486       offset_b -= const_length_b;
3487     }
3488   else
3489     const_length_b = tree_to_poly_uint64 (segment_length_b);
3490
3491   const_length_a += access_size_a;
3492   const_length_b += access_size_b;
3493
3494   if (ranges_known_overlap_p (offset_a, const_length_a,
3495                               offset_b, const_length_b))
3496     return 1;
3497
3498   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3499                                offset_b, const_length_b))
3500     return 0;
3501
3502   return -1;
3503 }
3504
3505 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3506    in DDR is >= VF.  */
3507
3508 static bool
3509 dependence_distance_ge_vf (data_dependence_relation *ddr,
3510                            unsigned int loop_depth, poly_uint64 vf)
3511 {
3512   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3513       || DDR_NUM_DIST_VECTS (ddr) == 0)
3514     return false;
3515
3516   /* If the dependence is exact, we should have limited the VF instead.  */
3517   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3518
3519   unsigned int i;
3520   lambda_vector dist_v;
3521   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3522     {
3523       HOST_WIDE_INT dist = dist_v[loop_depth];
3524       if (dist != 0
3525           && !(dist > 0 && DDR_REVERSED_P (ddr))
3526           && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3527         return false;
3528     }
3529
3530   if (dump_enabled_p ())
3531     dump_printf_loc (MSG_NOTE, vect_location,
3532                      "dependence distance between %T and %T is >= VF\n",
3533                      DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3534
3535   return true;
3536 }
3537
3538 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3539
3540 static void
3541 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3542 {
3543   dump_printf (dump_kind, "%s (%T) >= ",
3544                lower_bound.unsigned_p ? "unsigned" : "abs",
3545                lower_bound.expr);
3546   dump_dec (dump_kind, lower_bound.min_value);
3547 }
3548
3549 /* Record that the vectorized loop requires the vec_lower_bound described
3550    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3551
3552 static void
3553 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3554                         poly_uint64 min_value)
3555 {
3556   vec<vec_lower_bound> &lower_bounds
3557     = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3558   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3559     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3560       {
3561         unsigned_p &= lower_bounds[i].unsigned_p;
3562         min_value = upper_bound (lower_bounds[i].min_value, min_value);
3563         if (lower_bounds[i].unsigned_p != unsigned_p
3564             || maybe_lt (lower_bounds[i].min_value, min_value))
3565           {
3566             lower_bounds[i].unsigned_p = unsigned_p;
3567             lower_bounds[i].min_value = min_value;
3568             if (dump_enabled_p ())
3569               {
3570                 dump_printf_loc (MSG_NOTE, vect_location,
3571                                  "updating run-time check to ");
3572                 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3573                 dump_printf (MSG_NOTE, "\n");
3574               }
3575           }
3576         return;
3577       }
3578
3579   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3580   if (dump_enabled_p ())
3581     {
3582       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3583       dump_lower_bound (MSG_NOTE, lower_bound);
3584       dump_printf (MSG_NOTE, "\n");
3585     }
3586   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3587 }
3588
3589 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3590    will span fewer than GAP bytes.  */
3591
3592 static bool
3593 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3594                   poly_int64 gap)
3595 {
3596   stmt_vec_info stmt_info = dr_info->stmt;
3597   HOST_WIDE_INT count
3598     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3599   if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3600     count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3601   return (estimated_poly_value (gap)
3602           <= count * vect_get_scalar_dr_size (dr_info));
3603 }
3604
3605 /* Return true if we know that there is no alias between DR_INFO_A and
3606    DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3607    When returning true, set *LOWER_BOUND_OUT to this N.  */
3608
3609 static bool
3610 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3611                                 poly_uint64 *lower_bound_out)
3612 {
3613   /* Check that there is a constant gap of known sign between DR_A
3614      and DR_B.  */
3615   data_reference *dr_a = dr_info_a->dr;
3616   data_reference *dr_b = dr_info_b->dr;
3617   poly_int64 init_a, init_b;
3618   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3619       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3620       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3621       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3622       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3623       || !ordered_p (init_a, init_b))
3624     return false;
3625
3626   /* Sort DR_A and DR_B by the address they access.  */
3627   if (maybe_lt (init_b, init_a))
3628     {
3629       std::swap (init_a, init_b);
3630       std::swap (dr_info_a, dr_info_b);
3631       std::swap (dr_a, dr_b);
3632     }
3633
3634   /* If the two accesses could be dependent within a scalar iteration,
3635      make sure that we'd retain their order.  */
3636   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3637       && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3638     return false;
3639
3640   /* There is no alias if abs (DR_STEP) is greater than or equal to
3641      the bytes spanned by the combination of the two accesses.  */
3642   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3643   return true;
3644 }
3645
3646 /* Function vect_prune_runtime_alias_test_list.
3647
3648    Prune a list of ddrs to be tested at run-time by versioning for alias.
3649    Merge several alias checks into one if possible.
3650    Return FALSE if resulting list of ddrs is longer than allowed by
3651    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3652
3653 opt_result
3654 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3655 {
3656   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3657   hash_set <tree_pair_hash> compared_objects;
3658
3659   const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3660   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3661     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3662   const vec<vec_object_pair> &check_unequal_addrs
3663     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3664   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3665   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3666
3667   ddr_p ddr;
3668   unsigned int i;
3669   tree length_factor;
3670
3671   DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3672
3673   /* Step values are irrelevant for aliasing if the number of vector
3674      iterations is equal to the number of scalar iterations (which can
3675      happen for fully-SLP loops).  */
3676   bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3677
3678   if (!vf_one_p)
3679     {
3680       /* Convert the checks for nonzero steps into bound tests.  */
3681       tree value;
3682       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3683         vect_check_lower_bound (loop_vinfo, value, true, 1);
3684     }
3685
3686   if (may_alias_ddrs.is_empty ())
3687     return opt_result::success ();
3688
3689   comp_alias_ddrs.create (may_alias_ddrs.length ());
3690
3691   unsigned int loop_depth
3692     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3693                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
3694
3695   /* First, we collect all data ref pairs for aliasing checks.  */
3696   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3697     {
3698       poly_uint64 lower_bound;
3699       tree segment_length_a, segment_length_b;
3700       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3701       unsigned int align_a, align_b;
3702
3703       /* Ignore the alias if the VF we chose ended up being no greater
3704          than the dependence distance.  */
3705       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3706         continue;
3707
3708       if (DDR_OBJECT_A (ddr))
3709         {
3710           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3711           if (!compared_objects.add (new_pair))
3712             {
3713               if (dump_enabled_p ())
3714                 dump_printf_loc (MSG_NOTE, vect_location,
3715                                  "checking that %T and %T"
3716                                  " have different addresses\n",
3717                                  new_pair.first, new_pair.second);
3718               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3719             }
3720           continue;
3721         }
3722
3723       dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3724       stmt_vec_info stmt_info_a = dr_info_a->stmt;
3725
3726       dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3727       stmt_vec_info stmt_info_b = dr_info_b->stmt;
3728
3729       bool preserves_scalar_order_p
3730         = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3731       bool ignore_step_p
3732           = (vf_one_p
3733              && (preserves_scalar_order_p
3734                  || operand_equal_p (DR_STEP (dr_info_a->dr),
3735                                      DR_STEP (dr_info_b->dr))));
3736
3737       /* Skip the pair if inter-iteration dependencies are irrelevant
3738          and intra-iteration dependencies are guaranteed to be honored.  */
3739       if (ignore_step_p
3740           && (preserves_scalar_order_p
3741               || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3742                                                  &lower_bound)))
3743         {
3744           if (dump_enabled_p ())
3745             dump_printf_loc (MSG_NOTE, vect_location,
3746                              "no need for alias check between "
3747                              "%T and %T when VF is 1\n",
3748                              DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3749           continue;
3750         }
3751
3752       /* See whether we can handle the alias using a bounds check on
3753          the step, and whether that's likely to be the best approach.
3754          (It might not be, for example, if the minimum step is much larger
3755          than the number of bytes handled by one vector iteration.)  */
3756       if (!ignore_step_p
3757           && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3758           && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3759                                              &lower_bound)
3760           && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3761               || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3762         {
3763           bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3764           if (dump_enabled_p ())
3765             {
3766               dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3767                                "%T and %T when the step %T is outside ",
3768                                DR_REF (dr_info_a->dr),
3769                                DR_REF (dr_info_b->dr),
3770                                DR_STEP (dr_info_a->dr));
3771               if (unsigned_p)
3772                 dump_printf (MSG_NOTE, "[0");
3773               else
3774                 {
3775                   dump_printf (MSG_NOTE, "(");
3776                   dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3777                 }
3778               dump_printf (MSG_NOTE, ", ");
3779               dump_dec (MSG_NOTE, lower_bound);
3780               dump_printf (MSG_NOTE, ")\n");
3781             }
3782           vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3783                                   unsigned_p, lower_bound);
3784           continue;
3785         }
3786
3787       stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3788       if (dr_group_first_a)
3789         {
3790           stmt_info_a = dr_group_first_a;
3791           dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3792         }
3793
3794       stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3795       if (dr_group_first_b)
3796         {
3797           stmt_info_b = dr_group_first_b;
3798           dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3799         }
3800
3801       if (ignore_step_p)
3802         {
3803           segment_length_a = size_zero_node;
3804           segment_length_b = size_zero_node;
3805         }
3806       else
3807         {
3808           if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3809                                 DR_STEP (dr_info_b->dr), 0))
3810             length_factor = scalar_loop_iters;
3811           else
3812             length_factor = size_int (vect_factor);
3813           segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3814           segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3815         }
3816       access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
3817       access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
3818       align_a = vect_vfa_align (dr_info_a);
3819       align_b = vect_vfa_align (dr_info_b);
3820
3821       /* See whether the alias is known at compilation time.  */
3822       if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3823                            DR_BASE_ADDRESS (dr_info_b->dr), 0)
3824           && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3825                               DR_OFFSET (dr_info_b->dr), 0)
3826           && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3827           && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3828           && poly_int_tree_p (segment_length_a)
3829           && poly_int_tree_p (segment_length_b))
3830         {
3831           int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3832                                              segment_length_a,
3833                                              segment_length_b,
3834                                              access_size_a,
3835                                              access_size_b);
3836           if (res >= 0 && dump_enabled_p ())
3837             {
3838               dump_printf_loc (MSG_NOTE, vect_location,
3839                                "can tell at compile time that %T and %T",
3840                                DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3841               if (res == 0)
3842                 dump_printf (MSG_NOTE, " do not alias\n");
3843               else
3844                 dump_printf (MSG_NOTE, " alias\n");
3845             }
3846
3847           if (res == 0)
3848             continue;
3849
3850           if (res == 1)
3851             return opt_result::failure_at (stmt_info_b->stmt,
3852                                            "not vectorized:"
3853                                            " compilation time alias: %G%G",
3854                                            stmt_info_a->stmt,
3855                                            stmt_info_b->stmt);
3856         }
3857
3858       dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3859                             access_size_a, align_a);
3860       dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3861                             access_size_b, align_b);
3862       /* Canonicalize the order to be the one that's needed for accurate
3863          RAW, WAR and WAW flags, in cases where the data references are
3864          well-ordered.  The order doesn't really matter otherwise,
3865          but we might as well be consistent.  */
3866       if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3867         std::swap (dr_a, dr_b);
3868
3869       dr_with_seg_len_pair_t dr_with_seg_len_pair
3870         (dr_a, dr_b, (preserves_scalar_order_p
3871                       ? dr_with_seg_len_pair_t::WELL_ORDERED
3872                       : dr_with_seg_len_pair_t::REORDERED));
3873
3874       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3875     }
3876
3877   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3878
3879   unsigned int count = (comp_alias_ddrs.length ()
3880                         + check_unequal_addrs.length ());
3881
3882   if (count
3883       && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
3884           == VECT_COST_MODEL_VERY_CHEAP))
3885     return opt_result::failure_at
3886       (vect_location, "would need a runtime alias check\n");
3887
3888   if (dump_enabled_p ())
3889     dump_printf_loc (MSG_NOTE, vect_location,
3890                      "improved number of alias checks from %d to %d\n",
3891                      may_alias_ddrs.length (), count);
3892   unsigned limit = param_vect_max_version_for_alias_checks;
3893   if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
3894     limit = param_vect_max_version_for_alias_checks * 6 / 10;
3895   if (count > limit)
3896     return opt_result::failure_at
3897       (vect_location,
3898        "number of versioning for alias run-time tests exceeds %d "
3899        "(--param vect-max-version-for-alias-checks)\n", limit);
3900
3901   return opt_result::success ();
3902 }
3903
3904 /* Decide whether an internal function can implement a gather load
3905    (READ_P true) or scatter store (READ_P false).  MASKED_P says whether
3906    the access is conditional.  VECTYPE is the vector type of the data,
3907    MEMORY_TYPE the type of the elements in memory and OFFSET_TYPE the
3908    type of the offset applied to the invariant base address; SCALE is
3909    the factor applied to the offset *after* it has been converted to
3910    address width.  Wider variants of OFFSET_TYPE are tried as well.
3911
3912    On success return true, with the function id in *IFN_OUT and the
3913    vector type to use for the offset in *OFFSET_VECTYPE_OUT.  */
3914
3915 bool
3916 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3917                           tree vectype, tree memory_type, tree offset_type,
3918                           int scale, internal_fn *ifn_out,
3919                           tree *offset_vectype_out)
3920 {
3921   /* For now the vector elements must be the same width as the
3922      memory elements.  */
3923   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3924   unsigned int element_bits = vector_element_bits (vectype);
3925   if (memory_bits != element_bits)
3926     return false;
3927
3928   /* Work out which functions are worth trying, most preferred first:
3929        - the function that matches MASKED_P exactly;
3930        - for unconditional accesses only, the masked form;
3931        - the MASK_LEN form.  When the target supports it, we always
3932          use it regardless whether len and mask are valid or not.  */
3933   internal_fn candidates[3];
3934   unsigned int num_candidates = 0;
3935   if (read_p)
3936     {
3937       candidates[num_candidates++]
3938         = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3939       if (!masked_p)
3940         candidates[num_candidates++] = IFN_MASK_GATHER_LOAD;
3941       candidates[num_candidates++] = IFN_MASK_LEN_GATHER_LOAD;
3942     }
3943   else
3944     {
3945       candidates[num_candidates++]
3946         = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3947       if (!masked_p)
3948         candidates[num_candidates++] = IFN_MASK_SCATTER_STORE;
3949       candidates[num_candidates++] = IFN_MASK_LEN_SCATTER_STORE;
3950     }
3951
3952   /* Start with OFFSET_TYPE itself and keep doubling its precision until
3953      the target accepts one of the candidates or no vector type exists
3954      for the offset.  */
3955   for (;;)
3956     {
3957       tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3958       if (!offset_vectype)
3959         return false;
3960
3961       /* Test whether the target supports this combination.  */
3962       for (unsigned int i = 0; i < num_candidates; ++i)
3963         {
3964           internal_fn candidate = candidates[i];
3965           if (internal_gather_scatter_fn_supported_p (candidate, vectype,
3966                                                       memory_type,
3967                                                       offset_vectype, scale))
3968             {
3969               *ifn_out = candidate;
3970               *offset_vectype_out = offset_vectype;
3971               return true;
3972             }
3973         }
3974
3975       /* Give up once the offset is at least as wide as both a pointer
3976          and a vector element.  */
3977       unsigned int offset_bits = TYPE_PRECISION (offset_type);
3978       bool covers_pointer_p = offset_bits >= POINTER_SIZE;
3979       bool covers_element_p = offset_bits >= element_bits;
3980       if (covers_pointer_p && covers_element_p)
3981         return false;
3982
3983       /* Otherwise retry with an offset type that has twice the precision
3984          and the same signedness.  */
3985       offset_type
3986         = build_nonstandard_integer_type (offset_bits * 2,
3987                                           TYPE_UNSIGNED (offset_type));
3988     }
3989 }
3990
3991 /* STMT_INFO's statement is already a call to an internal gather load or
3992    scatter store function.  Fill in INFO with a description of it.  */
3993
3994 static void
3995 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3996                                    gather_scatter_info *info)
3997 {
3998   gcall *gs_call = as_a <gcall *> (stmt_info->stmt);
3999
4000   /* Arguments 0, 1 and 2 of the call are the base, offset and scale.  */
4001   info->ifn = gimple_call_internal_fn (gs_call);
4002   info->base = gimple_call_arg (gs_call, 0);
4003   info->offset = gimple_call_arg (gs_call, 1);
4004   info->scale = TREE_INT_CST_LOW (gimple_call_arg (gs_call, 2));
4005   info->element_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
4006   info->memory_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
4007   /* No builtin decl, offset definition type or offset vectype is known.  */
4008   info->decl = NULL_TREE;
4009   info->offset_dt = vect_unknown_def_type;
4010   info->offset_vectype = NULL_TREE;
4011 }
4012
4013 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
4014    gather load or scatter store.  Describe the operation in *INFO if so.
        LOOP_VINFO supplies the loop against which invariance is tested.
        Unless STMT_INFO is already a gather/scatter internal call, a true
        return means that the address was split into a loop-invariant
        INFO->base, an SSA name INFO->offset that varies in the loop and a
        constant INFO->scale.  INFO->ifn is then the internal function to
        use, or IFN_LAST if there is none, and INFO->decl the target builtin
        to use, or NULL_TREE if there is none; both can be absent at once.  */
4015
4016 bool
4017 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
4018                            gather_scatter_info *info)
4019 {
4020   HOST_WIDE_INT scale = 1;
4021   poly_int64 pbitpos, pbitsize;
4022   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4023   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4024   tree offtype = NULL_TREE;
4025   tree decl = NULL_TREE, base, off;
4026   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4027   tree memory_type = TREE_TYPE (DR_REF (dr));
4028   machine_mode pmode;
4029   int punsignedp, reversep, pvolatilep = 0;
4030   internal_fn ifn;
       /* vect_gather_scatter_fn_p only stores the offset vector type on
          success, so start from a known value in case every query fails.  */
4031   tree offset_vectype = NULL_TREE;
4032   bool masked_p = false;
4033
4034   /* See whether this is already a call to a gather/scatter internal function.
4035      If not, see whether it's a masked load or store.  */
4036   gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
4037   if (call && gimple_call_internal_p (call))
4038     {
4039       ifn = gimple_call_internal_fn (call);
4040       if (internal_gather_scatter_fn_p (ifn))
4041         {
4042           vect_describe_gather_scatter_call (stmt_info, info);
4043           return true;
4044         }
4045       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
4046     }
4047
4048   /* True if we should aim to use internal functions rather than
4049      built-in functions.  */
4050   bool use_ifn_p = (DR_IS_READ (dr)
4051                     ? supports_vec_gather_load_p (TYPE_MODE (vectype))
4052                     : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
4053
4054   base = DR_REF (dr);
4055   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
4056      see if we can use the def stmt of the address.  */
4057   if (masked_p
4058       && TREE_CODE (base) == MEM_REF
4059       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4060       && integer_zerop (TREE_OPERAND (base, 1))
4061       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4062     {
4063       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4064       if (is_gimple_assign (def_stmt)
4065           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4066         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4067     }
4068
4069   /* The gather and scatter builtins need address of the form
4070      loop_invariant + vector * {1, 2, 4, 8}
4071      or
4072      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4073      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4074      of loop invariants/SSA_NAMEs defined in the loop, with casts,
4075      multiplications and additions in it.  To get a vector, we need
4076      a single SSA_NAME that will be defined in the loop and will
4077      contain everything that is not loop invariant and that can be
4078      vectorized.  The following code attempts to find such a preexisting
4079      SSA_NAME OFF and put the loop invariants into a tree BASE
4080      that can be gimplified before the loop.  */
4081   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4082                               &punsignedp, &reversep, &pvolatilep);
4083   if (reversep)
4084     return false;
4085
4086   /* PR 107346.  Packed structs can have fields at offsets that are not
4087      multiples of BITS_PER_UNIT.  Do not use gather/scatters in such cases.  */
4088   if (!multiple_p (pbitpos, BITS_PER_UNIT))
4089     return false;
4090
4091   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4092
4093   if (TREE_CODE (base) == MEM_REF)
4094     {
4095       if (!integer_zerop (TREE_OPERAND (base, 1)))
4096         {
4097           if (off == NULL_TREE)
4098             off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4099           else
4100             off = size_binop (PLUS_EXPR, off,
4101                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
4102         }
4103       base = TREE_OPERAND (base, 0);
4104     }
4105   else
4106     base = build_fold_addr_expr (base);
4107
4108   if (off == NULL_TREE)
4109     off = size_zero_node;
4110
4111   /* If base is not loop invariant, either off is 0, then we start with just
4112      the constant offset in the loop invariant BASE and continue with base
4113      as OFF, otherwise give up.
4114      We could handle that case by gimplifying the addition of base + off
4115      into some SSA_NAME and use that as off, but for now punt.  */
4116   if (!expr_invariant_in_loop_p (loop, base))
4117     {
4118       if (!integer_zerop (off))
4119         return false;
4120       off = base;
4121       base = size_int (pbytepos);
4122     }
4123   /* Otherwise put base + constant offset into the loop invariant BASE
4124      and continue with OFF.  */
4125   else
4126     {
4127       base = fold_convert (sizetype, base);
4128       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4129     }
4130
4131   /* OFF at this point may be either a SSA_NAME or some tree expression
4132      from get_inner_reference.  Try to peel off loop invariants from it
4133      into BASE as long as possible.  */
4134   STRIP_NOPS (off);
4135   while (offtype == NULL_TREE)
4136     {
4137       enum tree_code code;
4138       tree op0, op1, add = NULL_TREE;
4139
4140       if (TREE_CODE (off) == SSA_NAME)
4141         {
4142           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4143
4144           if (expr_invariant_in_loop_p (loop, off))
4145             return false;
4146
4147           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4148             break;
4149
4150           op0 = gimple_assign_rhs1 (def_stmt);
4151           code = gimple_assign_rhs_code (def_stmt);
4152           op1 = gimple_assign_rhs2 (def_stmt);
4153         }
4154       else
4155         {
4156           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4157             return false;
4158           code = TREE_CODE (off);
4159           extract_ops_from_tree (off, &code, &op0, &op1);
4160         }
4161       switch (code)
4162         {
4163         case POINTER_PLUS_EXPR:
4164         case PLUS_EXPR:
4165           if (expr_invariant_in_loop_p (loop, op0))
4166             {
4167               add = op0;
4168               off = op1;
4169             do_add:
                   /* ADD was peeled from the part of the offset that SCALE
                      applies to, so scale it before folding it into BASE.  */
4170               add = fold_convert (sizetype, add);
4171               if (scale != 1)
4172                 add = size_binop (MULT_EXPR, add, size_int (scale));
4173               base = size_binop (PLUS_EXPR, base, add);
4174               continue;
4175             }
4176           if (expr_invariant_in_loop_p (loop, op1))
4177             {
4178               add = op1;
4179               off = op0;
4180               goto do_add;
4181             }
4182           break;
4183         case MINUS_EXPR:
4184           if (expr_invariant_in_loop_p (loop, op1))
4185             {
4186               add = fold_convert (sizetype, op1);
4187               add = size_binop (MINUS_EXPR, size_zero_node, add);
4188               off = op0;
4189               goto do_add;
4190             }
4191           break;
4192         case MULT_EXPR:
4193           if (scale == 1 && tree_fits_shwi_p (op1))
4194             {
                   /* Accept a constant multiplier only while SCALE is still 1.
                      The target queries take the scale as an int, so the
                      multiplier must also survive that narrowing unchanged;
                      otherwise leave the multiplication as part of OFF.  */
                   HOST_WIDE_INT wide_scale = tree_to_shwi (op1);
4195               int new_scale = wide_scale;
                   if (new_scale != wide_scale)
                     break;
4196               /* Only treat this as a scaling operation if the target
4197                  supports it for at least some offset type.  */
4198               if (use_ifn_p
4199                   && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4200                                                 masked_p, vectype, memory_type,
4201                                                 signed_char_type_node,
4202                                                 new_scale, &ifn,
4203                                                 &offset_vectype)
4204                   && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4205                                                 masked_p, vectype, memory_type,
4206                                                 unsigned_char_type_node,
4207                                                 new_scale, &ifn,
4208                                                 &offset_vectype))
4209                 break;
4210               scale = new_scale;
4211               off = op0;
4212               continue;
4213             }
4214           break;
4215         case SSA_NAME:
               /* A plain copy of another SSA name; look through it.  */
4216           off = op0;
4217           continue;
4218         CASE_CONVERT:
4219           if (!POINTER_TYPE_P (TREE_TYPE (op0))
4220               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4221             break;
4222
4223           /* Don't include the conversion if the target is happy with
4224              the current offset type.  */
4225           if (use_ifn_p
4226               && TREE_CODE (off) == SSA_NAME
4227               && !POINTER_TYPE_P (TREE_TYPE (off))
4228               && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4229                                            masked_p, vectype, memory_type,
4230                                            TREE_TYPE (off), scale, &ifn,
4231                                            &offset_vectype))
4232             break;
4233
4234           if (TYPE_PRECISION (TREE_TYPE (op0))
4235               == TYPE_PRECISION (TREE_TYPE (off)))
4236             {
4237               off = op0;
4238               continue;
4239             }
4240
4241           /* Include the conversion if it is widening and we're using
4242              the IFN path or the target can handle the converted from
4243              offset or the current size is not already the same as the
4244              data vector element size.  */
4245           if ((TYPE_PRECISION (TREE_TYPE (op0))
4246                < TYPE_PRECISION (TREE_TYPE (off)))
4247               && (use_ifn_p
4248                   || (DR_IS_READ (dr)
4249                       ? (targetm.vectorize.builtin_gather
4250                          && targetm.vectorize.builtin_gather (vectype,
4251                                                               TREE_TYPE (op0),
4252                                                               scale))
4253                       : (targetm.vectorize.builtin_scatter
4254                          && targetm.vectorize.builtin_scatter (vectype,
4255                                                                TREE_TYPE (op0),
4256                                                                scale)))
4257                   || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4258                                        TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4259             {
4260               off = op0;
4261               offtype = TREE_TYPE (off);
4262               STRIP_NOPS (off);
4263               continue;
4264             }
4265           break;
4266         default:
4267           break;
4268         }
4269       break;
4270     }
4271
4272   /* If at the end OFF still isn't a SSA_NAME or isn't
4273      defined in the loop, punt.  */
4274   if (TREE_CODE (off) != SSA_NAME
4275       || expr_invariant_in_loop_p (loop, off))
4276     return false;
4277
4278   if (offtype == NULL_TREE)
4279     offtype = TREE_TYPE (off);
4280
4281   if (use_ifn_p)
4282     {
           /* If no internal function fits, record that with IFN_LAST rather
              than failing.  */
4283       if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4284                                      vectype, memory_type, offtype, scale,
4285                                      &ifn, &offset_vectype))
4286         ifn = IFN_LAST;
4287       decl = NULL_TREE;
4288     }
4289   else
4290     {
4291       if (DR_IS_READ (dr))
4292         {
4293           if (targetm.vectorize.builtin_gather)
4294             decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4295         }
4296       else
4297         {
4298           if (targetm.vectorize.builtin_scatter)
4299             decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4300         }
4301       ifn = IFN_LAST;
4302       /* The offset vector type will be read from DECL when needed.  */
4303       offset_vectype = NULL_TREE;
4304     }
4305
4306   info->ifn = ifn;
4307   info->decl = decl;
4308   info->base = base;
4309   info->offset = off;
4310   info->offset_dt = vect_unknown_def_type;
4311   info->offset_vectype = offset_vectype;
4312   info->scale = scale;
4313   info->element_type = TREE_TYPE (vectype);
4314   info->memory_type = memory_type;
4315   return true;
4316 }
4317
4318 /* Find the data references in STMT, analyze them with respect to LOOP and
4319    append them to DATAREFS.  Return a failure result if datarefs in this
4320    stmt cannot be handled, and success otherwise (including when STMT is a
        clobber or has no data reference).  At most one data reference per
        statement is accepted.  If DATAREF_GROUPS is nonnull, GROUP_ID is
        pushed onto it whenever a data reference is pushed onto DATAREFS.  */
4321
4322 opt_result
4323 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4324                                vec<data_reference_p> *datarefs,
4325                                vec<int> *dataref_groups, int group_id)
4326 {
4327   /* We can ignore clobbers for dataref analysis - they are removed during
4328      loop vectorization and BB vectorization checks dependences with a
4329      stmt walk.  */
4330   if (gimple_clobber_p (stmt))
4331     return opt_result::success ();
4332
4333   if (gimple_has_volatile_ops (stmt))
4334     return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4335                                    stmt);
4336
4337   if (stmt_can_throw_internal (cfun, stmt))
4338     return opt_result::failure_at (stmt,
4339                                    "not vectorized:"
4340                                    " statement can throw an exception: %G",
4341                                    stmt);
4342
4343   auto_vec<data_reference_p, 2> refs;
4344   opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4345   if (!res)
4346     return res;
4347
4348   if (refs.is_empty ())
4349     return opt_result::success ();
4350
       /* Statements with more than one data reference are rejected; release
          everything that was collected before reporting the failure.  */
4351   if (refs.length () > 1)
4352     {
4353       while (!refs.is_empty ())
4354         free_data_ref (refs.pop ());
4355       return opt_result::failure_at (stmt,
4356                                      "not vectorized: more than one "
4357                                      "data ref in stmt: %G", stmt);
4358     }
4359
       /* From here on DR must be freed on every failure path.  Among calls,
          only IFN_MASK_LOAD and IFN_MASK_STORE may carry a data reference.  */
4360   data_reference_p dr = refs.pop ();
4361   if (gcall *call = dyn_cast <gcall *> (stmt))
4362     if (!gimple_call_internal_p (call)
4363         || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4364             && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4365       {
4366         free_data_ref (dr);
4367         return opt_result::failure_at (stmt,
4368                                        "not vectorized: dr in a call %G", stmt);
4369       }
4370
4371   if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4372       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4373     {
4374       free_data_ref (dr);
4375       return opt_result::failure_at (stmt,
4376                                      "not vectorized:"
4377                                      " statement is an unsupported"
4378                                      " bitfield access %G", stmt);
4379     }
4380
4381   if (DR_BASE_ADDRESS (dr)
4382       && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4383     {
4384       free_data_ref (dr);
4385       return opt_result::failure_at (stmt,
4386                                      "not vectorized:"
4387                                      " base addr of dr is a constant\n");
4388     }
4389
4390   /* Check whether this may be a SIMD lane access and adjust the
4391      DR to make it easier for us to handle it.  This is only tried when
          LOOP has a simduid and the analysis above left the base address,
          offset, init or step of DR unknown.  */
4392   if (loop
4393       && loop->simduid
4394       && (!DR_BASE_ADDRESS (dr)
4395           || !DR_OFFSET (dr)
4396           || !DR_INIT (dr)
4397           || !DR_STEP (dr)))
4398     {
           /* Build a second data reference for the same access, this time
              relative to the loop that contains STMT, and accept it only if
              that analysis is complete with a constant init and zero step.  */
4399       struct data_reference *newdr
4400         = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4401                            DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4402       if (DR_BASE_ADDRESS (newdr)
4403           && DR_OFFSET (newdr)
4404           && DR_INIT (newdr)
4405           && DR_STEP (newdr)
4406           && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4407           && integer_zerop (DR_STEP (newdr)))
4408         {
4409           tree base_address = DR_BASE_ADDRESS (newdr);
4410           tree off = DR_OFFSET (newdr);
4411           tree step = ssize_int (1);
               /* If the offset is zero, it may instead be hidden in a
                  POINTER_PLUS_EXPR base address; split that apart.  */
4412           if (integer_zerop (off)
4413               && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4414             {
4415               off = TREE_OPERAND (base_address, 1);
4416               base_address = TREE_OPERAND (base_address, 0);
4417             }
4418           STRIP_NOPS (off);
               /* A constant multiplier of the offset becomes the candidate
                  step; without one the step stays at 1.  */
4419           if (TREE_CODE (off) == MULT_EXPR
4420               && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4421             {
4422               step = TREE_OPERAND (off, 1);
4423               off = TREE_OPERAND (off, 0);
4424               STRIP_NOPS (off);
4425             }
4426           if (CONVERT_EXPR_P (off)
4427               && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4428                   < TYPE_PRECISION (TREE_TYPE (off))))
4429             off = TREE_OPERAND (off, 0);
4430           if (TREE_CODE (off) == SSA_NAME)
4431             {
4432               gimple *def = SSA_NAME_DEF_STMT (off);
4433               /* Look through widening conversion.  */
4434               if (is_gimple_assign (def)
4435                   && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4436                 {
4437                   tree rhs1 = gimple_assign_rhs1 (def);
4438                   if (TREE_CODE (rhs1) == SSA_NAME
4439                       && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4440                       && (TYPE_PRECISION (TREE_TYPE (off))
4441                           > TYPE_PRECISION (TREE_TYPE (rhs1))))
4442                     def = SSA_NAME_DEF_STMT (rhs1);
4443                 }
                   /* The remaining offset must come from an IFN_GOMP_SIMD_LANE
                      call whose first argument is based on LOOP's simduid.  */
4444               if (is_gimple_call (def)
4445                   && gimple_call_internal_p (def)
4446                   && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4447                 {
4448                   tree arg = gimple_call_arg (def, 0);
4449                   tree reft = TREE_TYPE (DR_REF (newdr));
4450                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
4451                   arg = SSA_NAME_VAR (arg);
4452                   if (arg == loop->simduid
4453                       /* For now.  */
4454                       && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4455                     {
                           /* Rewrite NEWDR as an access with a zero offset and
                              a step equal to the size of the referenced type.  */
4456                       DR_BASE_ADDRESS (newdr) = base_address;
4457                       DR_OFFSET (newdr) = ssize_int (0);
4458                       DR_STEP (newdr) = step;
4459                       DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4460                       DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4461                       /* Mark as simd-lane access.  The marker is a negative
                              value that also encodes the call's second
                              argument.  */
4462                       tree arg2 = gimple_call_arg (def, 1);
4463                       newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
                           /* NEWDR replaces DR, which is no longer needed.  */
4464                       free_data_ref (dr);
4465                       datarefs->safe_push (newdr);
4466                       if (dataref_groups)
4467                         dataref_groups->safe_push (group_id);
4468                       return opt_result::success ();
4469                     }
4470                 }
4471             }
4472         }
           /* Not a recognized SIMD lane access; discard NEWDR and fall back
              to the original DR.  */
4473       free_data_ref (newdr);
4474     }
4475
4476   datarefs->safe_push (dr);
4477   if (dataref_groups)
4478     dataref_groups->safe_push (group_id);
4479   return opt_result::success ();
4480 }
4481
4482 /* Function vect_analyze_data_refs.
4483
4484   Find all the data references in the loop or basic block.
4485
4486    The general structure of the analysis of data refs in the vectorizer is as
4487    follows:
4488    1- vect_analyze_data_refs(loop/bb): call
4489       compute_data_dependences_for_loop/bb to find and analyze all data-refs
4490       in the loop/bb and their dependences.
4491    2- vect_analyze_dependences(): apply dependence testing using ddrs.
4492    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4493    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4494
4495 */
4496
4497 opt_result
4498 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4499 {
4500   class loop *loop = NULL;
4501   unsigned int i;
4502   struct data_reference *dr;
4503   tree scalar_type;
4504
4505   DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4506
4507   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4508     loop = LOOP_VINFO_LOOP (loop_vinfo);
4509
4510   /* Go through the data-refs, check that the analysis succeeded.  Update
4511      pointer from stmt_vec_info struct to DR and vectype.  */
4512
4513   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4514   FOR_EACH_VEC_ELT (datarefs, i, dr)
4515     {
4516       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4517       poly_uint64 vf;
4518
4519       gcc_assert (DR_REF (dr));
4520       stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4521       gcc_assert (!stmt_info->dr_aux.dr);
4522       stmt_info->dr_aux.dr = dr;
4523       stmt_info->dr_aux.stmt = stmt_info;
4524
4525       /* Check that analysis of the data-ref succeeded.  */
4526       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4527           || !DR_STEP (dr))
4528         {
4529           bool maybe_gather
4530             = DR_IS_READ (dr)
4531               && !TREE_THIS_VOLATILE (DR_REF (dr));
4532           bool maybe_scatter
4533             = DR_IS_WRITE (dr)
4534               && !TREE_THIS_VOLATILE (DR_REF (dr));
4535
4536           /* If target supports vector gather loads or scatter stores,
4537              see if they can't be used.  */
4538           if (is_a <loop_vec_info> (vinfo)
4539               && !nested_in_vect_loop_p (loop, stmt_info))
4540             {
4541               if (maybe_gather || maybe_scatter)
4542                 {
4543                   if (maybe_gather)
4544                     gatherscatter = GATHER;
4545                   else
4546                     gatherscatter = SCATTER;
4547                 }
4548             }
4549
4550           if (gatherscatter == SG_NONE)
4551             {
4552               if (dump_enabled_p ())
4553                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4554                                  "not vectorized: data ref analysis "
4555                                  "failed %G", stmt_info->stmt);
4556               if (is_a <bb_vec_info> (vinfo))
4557                 {
4558                   /* In BB vectorization the ref can still participate
4559                      in dependence analysis, we just can't vectorize it.  */
4560                   STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4561                   continue;
4562                 }
4563               return opt_result::failure_at (stmt_info->stmt,
4564                                              "not vectorized:"
4565                                              " data ref analysis failed: %G",
4566                                              stmt_info->stmt);
4567             }
4568         }
4569
4570       /* See if this was detected as SIMD lane access.  */
4571       if (dr->aux == (void *)-1
4572           || dr->aux == (void *)-2
4573           || dr->aux == (void *)-3
4574           || dr->aux == (void *)-4)
4575         {
4576           if (nested_in_vect_loop_p (loop, stmt_info))
4577             return opt_result::failure_at (stmt_info->stmt,
4578                                            "not vectorized:"
4579                                            " data ref analysis failed: %G",
4580                                            stmt_info->stmt);
4581           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4582             = -(uintptr_t) dr->aux;
4583         }
4584
4585       tree base = get_base_address (DR_REF (dr));
4586       if (base && VAR_P (base) && DECL_NONALIASED (base))
4587         {
4588           if (dump_enabled_p ())
4589             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4590                              "not vectorized: base object not addressable "
4591                              "for stmt: %G", stmt_info->stmt);
4592           if (is_a <bb_vec_info> (vinfo))
4593             {
4594               /* In BB vectorization the ref can still participate
4595                  in dependence analysis, we just can't vectorize it.  */
4596               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4597               continue;
4598             }
4599           return opt_result::failure_at (stmt_info->stmt,
4600                                          "not vectorized: base object not"
4601                                          " addressable for stmt: %G",
4602                                          stmt_info->stmt);
4603         }
4604
4605       if (is_a <loop_vec_info> (vinfo)
4606           && DR_STEP (dr)
4607           && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4608         {
4609           if (nested_in_vect_loop_p (loop, stmt_info))
4610             return opt_result::failure_at (stmt_info->stmt,
4611                                            "not vectorized: "
4612                                            "not suitable for strided load %G",
4613                                            stmt_info->stmt);
4614           STMT_VINFO_STRIDED_P (stmt_info) = true;
4615         }
4616
4617       /* Update DR field in stmt_vec_info struct.  */
4618
4619       /* If the dataref is in an inner-loop of the loop that is considered for
4620          for vectorization, we also want to analyze the access relative to
4621          the outer-loop (DR contains information only relative to the
4622          inner-most enclosing loop).  We do that by building a reference to the
4623          first location accessed by the inner-loop, and analyze it relative to
4624          the outer-loop.  */
4625       if (loop && nested_in_vect_loop_p (loop, stmt_info))
4626         {
4627           /* Build a reference to the first location accessed by the
4628              inner loop: *(BASE + INIT + OFFSET).  By construction,
4629              this address must be invariant in the inner loop, so we
4630              can consider it as being used in the outer loop.  */
4631           tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4632           tree offset = unshare_expr (DR_OFFSET (dr));
4633           tree init = unshare_expr (DR_INIT (dr));
4634           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4635                                           init, offset);
4636           tree init_addr = fold_build_pointer_plus (base, init_offset);
4637           tree init_ref = build_fold_indirect_ref (init_addr);
4638
4639           if (dump_enabled_p ())
4640             dump_printf_loc (MSG_NOTE, vect_location,
4641                              "analyze in outer loop: %T\n", init_ref);
4642
4643           opt_result res
4644             = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4645                                     init_ref, loop, stmt_info->stmt);
4646           if (!res)
4647             /* dr_analyze_innermost already explained the failure.  */
4648             return res;
4649
4650           if (dump_enabled_p ())
4651             dump_printf_loc (MSG_NOTE, vect_location,
4652                              "\touter base_address: %T\n"
4653                              "\touter offset from base address: %T\n"
4654                              "\touter constant offset from base address: %T\n"
4655                              "\touter step: %T\n"
4656                              "\touter base alignment: %d\n\n"
4657                              "\touter base misalignment: %d\n"
4658                              "\touter offset alignment: %d\n"
4659                              "\touter step alignment: %d\n",
4660                              STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4661                              STMT_VINFO_DR_OFFSET (stmt_info),
4662                              STMT_VINFO_DR_INIT (stmt_info),
4663                              STMT_VINFO_DR_STEP (stmt_info),
4664                              STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4665                              STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4666                              STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4667                              STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4668         }
4669
4670       /* Set vectype for STMT.  */
4671       scalar_type = TREE_TYPE (DR_REF (dr));
4672       tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4673       if (!vectype)
4674         {
4675           if (dump_enabled_p ())
4676             {
4677               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4678                                "not vectorized: no vectype for stmt: %G",
4679                                stmt_info->stmt);
4680               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4681               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4682                                  scalar_type);
4683               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4684             }
4685
4686           if (is_a <bb_vec_info> (vinfo))
4687             {
4688               /* No vector type is fine, the ref can still participate
4689                  in dependence analysis, we just can't vectorize it.  */
4690               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4691               continue;
4692             }
4693           if (fatal)
4694             *fatal = false;
4695           return opt_result::failure_at (stmt_info->stmt,
4696                                          "not vectorized:"
4697                                          " no vectype for stmt: %G"
4698                                          " scalar_type: %T\n",
4699                                          stmt_info->stmt, scalar_type);
4700         }
4701       else
4702         {
4703           if (dump_enabled_p ())
4704             dump_printf_loc (MSG_NOTE, vect_location,
4705                              "got vectype for stmt: %G%T\n",
4706                              stmt_info->stmt, vectype);
4707         }
4708
4709       /* Adjust the minimal vectorization factor according to the
4710          vector type.  */
4711       vf = TYPE_VECTOR_SUBPARTS (vectype);
4712       *min_vf = upper_bound (*min_vf, vf);
4713
4714       /* Leave the BB vectorizer to pick the vector type later, based on
4715          the final dataref group size and SLP node size.  */
4716       if (is_a <loop_vec_info> (vinfo))
4717         STMT_VINFO_VECTYPE (stmt_info) = vectype;
4718
4719       if (gatherscatter != SG_NONE)
4720         {
4721           gather_scatter_info gs_info;
4722           if (!vect_check_gather_scatter (stmt_info,
4723                                           as_a <loop_vec_info> (vinfo),
4724                                           &gs_info)
4725               || !get_vectype_for_scalar_type (vinfo,
4726                                                TREE_TYPE (gs_info.offset)))
4727             {
4728               if (fatal)
4729                 *fatal = false;
4730               return opt_result::failure_at
4731                         (stmt_info->stmt,
4732                          (gatherscatter == GATHER)
4733                          ? "not vectorized: not suitable for gather load %G"
4734                          : "not vectorized: not suitable for scatter store %G",
4735                          stmt_info->stmt);
4736             }
4737           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4738         }
4739     }
4740
4741   /* We used to stop processing and prune the list here.  Verify we no
4742      longer need to.  */
4743   gcc_assert (i == datarefs.length ());
4744
4745   return opt_result::success ();
4746 }
4747
4748
4749 /* Function vect_get_new_vect_var.
4750
4751    Returns a name for a new variable.  The current naming scheme appends the
4752    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4753    the name of vectorizer generated variables, and appends that to NAME if
4754    provided.  */
4755
4756 tree
4757 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4758 {
4759   const char *prefix;
4760   tree new_vect_var;
4761
4762   switch (var_kind)
4763   {
4764   case vect_simple_var:
4765     prefix = "vect";
4766     break;
4767   case vect_scalar_var:
4768     prefix = "stmp";
4769     break;
4770   case vect_mask_var:
4771     prefix = "mask";
4772     break;
4773   case vect_pointer_var:
4774     prefix = "vectp";
4775     break;
4776   default:
4777     gcc_unreachable ();
4778   }
4779
4780   if (name)
4781     {
4782       char* tmp = concat (prefix, "_", name, NULL);
4783       new_vect_var = create_tmp_reg (type, tmp);
4784       free (tmp);
4785     }
4786   else
4787     new_vect_var = create_tmp_reg (type, prefix);
4788
4789   return new_vect_var;
4790 }
4791
4792 /* Like vect_get_new_vect_var but return an SSA name.  */
4793
4794 tree
4795 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4796 {
4797   const char *prefix;
4798   tree new_vect_var;
4799
4800   switch (var_kind)
4801   {
4802   case vect_simple_var:
4803     prefix = "vect";
4804     break;
4805   case vect_scalar_var:
4806     prefix = "stmp";
4807     break;
4808   case vect_pointer_var:
4809     prefix = "vectp";
4810     break;
4811   default:
4812     gcc_unreachable ();
4813   }
4814
4815   if (name)
4816     {
4817       char* tmp = concat (prefix, "_", name, NULL);
4818       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4819       free (tmp);
4820     }
4821   else
4822     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4823
4824   return new_vect_var;
4825 }
4826
4827 /* Duplicate points-to info on NAME from DR_INFO.  */
4828
4829 static void
4830 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4831 {
4832   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4833   /* DR_PTR_INFO is for a base SSA name, not including constant or
4834      variable offsets in the ref so its alignment info does not apply.  */
4835   mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4836 }
4837
4838 /* Function vect_create_addr_base_for_vector_ref.
4839
4840    Create an expression that computes the address of the first memory location
4841    that will be accessed for a data reference.
4842
4843    Input:
4844    STMT_INFO: The statement containing the data reference.
4845    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4846    OFFSET: Optional. If supplied, it is be added to the initial address.
4847    LOOP:    Specify relative to which loop-nest should the address be computed.
4848             For example, when the dataref is in an inner-loop nested in an
4849             outer-loop that is now being vectorized, LOOP can be either the
4850             outer-loop, or the inner-loop.  The first memory location accessed
4851             by the following dataref ('in' points to short):
4852
4853                 for (i=0; i<N; i++)
4854                    for (j=0; j<M; j++)
4855                      s += in[i+j]
4856
4857             is as follows:
4858             if LOOP=i_loop:     &in             (relative to i_loop)
4859             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
4860
4861    Output:
4862    1. Return an SSA_NAME whose value is the address of the memory location of
4863       the first vector of the data reference.
4864    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4865       these statement(s) which define the returned SSA_NAME.
4866
4867    FORNOW: We are only handling array accesses with step 1.  */
4868
4869 tree
4870 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
4871                                       gimple_seq *new_stmt_list,
4872                                       tree offset)
4873 {
4874   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4875   struct data_reference *dr = dr_info->dr;
4876   const char *base_name;
4877   tree addr_base;
4878   tree dest;
4879   gimple_seq seq = NULL;
4880   tree vect_ptr_type;
4881   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4882   innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
4883
4884   tree data_ref_base = unshare_expr (drb->base_address);
4885   tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
4886   tree init = unshare_expr (drb->init);
4887
4888   if (loop_vinfo)
4889     base_name = get_name (data_ref_base);
4890   else
4891     {
4892       base_offset = ssize_int (0);
4893       init = ssize_int (0);
4894       base_name = get_name (DR_REF (dr));
4895     }
4896
4897   /* Create base_offset */
4898   base_offset = size_binop (PLUS_EXPR,
4899                             fold_convert (sizetype, base_offset),
4900                             fold_convert (sizetype, init));
4901
4902   if (offset)
4903     {
4904       offset = fold_convert (sizetype, offset);
4905       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4906                                  base_offset, offset);
4907     }
4908
4909   /* base + base_offset */
4910   if (loop_vinfo)
4911     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4912   else
4913     addr_base = build1 (ADDR_EXPR,
4914                         build_pointer_type (TREE_TYPE (DR_REF (dr))),
4915                         /* Strip zero offset components since we don't need
4916                            them and they can confuse late diagnostics if
4917                            we CSE them wrongly.  See PR106904 for example.  */
4918                         unshare_expr (strip_zero_offset_components
4919                                                                 (DR_REF (dr))));
4920
4921   vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
4922   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4923   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4924   gimple_seq_add_seq (new_stmt_list, seq);
4925
4926   if (DR_PTR_INFO (dr)
4927       && TREE_CODE (addr_base) == SSA_NAME
4928       /* We should only duplicate pointer info to newly created SSA names.  */
4929       && SSA_NAME_VAR (addr_base) == dest)
4930     {
4931       gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
4932       vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4933     }
4934
4935   if (dump_enabled_p ())
4936     dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4937
4938   return addr_base;
4939 }
4940
4941
4942 /* Function vect_create_data_ref_ptr.
4943
4944    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4945    location accessed in the loop by STMT_INFO, along with the def-use update
4946    chain to appropriately advance the pointer through the loop iterations.
4947    Also set aliasing information for the pointer.  This pointer is used by
4948    the callers to this function to create a memory reference expression for
4949    vector load/store access.
4950
4951    Input:
4952    1. STMT_INFO: a stmt that references memory. Expected to be of the form
4953          GIMPLE_ASSIGN <name, data-ref> or
4954          GIMPLE_ASSIGN <data-ref, name>.
4955    2. AGGR_TYPE: the type of the reference, which should be either a vector
4956         or an array.
4957    3. AT_LOOP: the loop where the vector memref is to be created.
4958    4. OFFSET (optional): a byte offset to be added to the initial address
4959         accessed by the data-ref in STMT_INFO.
4960    5. BSI: location where the new stmts are to be placed if there is no loop
4961    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4962         pointing to the initial address.
4963    8. IV_STEP (optional, defaults to NULL): the amount that should be added
4964         to the IV during each iteration of the loop.  NULL says to move
4965         by one copy of AGGR_TYPE up or down, depending on the step of the
4966         data reference.
4967
4968    Output:
4969    1. Declare a new ptr to vector_type, and have it point to the base of the
4970       data reference (initial addressed accessed by the data reference).
4971       For example, for vector of type V8HI, the following code is generated:
4972
4973       v8hi *ap;
4974       ap = (v8hi *)initial_address;
4975
4976       if OFFSET is not supplied:
4977          initial_address = &a[init];
4978       if OFFSET is supplied:
4979          initial_address = &a[init] + OFFSET;
4980       if BYTE_OFFSET is supplied:
4981          initial_address = &a[init] + BYTE_OFFSET;
4982
4983       Return the initial_address in INITIAL_ADDRESS.
4984
4985    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4986       update the pointer in each iteration of the loop.
4987
4988       Return the increment stmt that updates the pointer in PTR_INCR.
4989
4990    3. Return the pointer.  */
4991
4992 tree
4993 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
4994                           tree aggr_type, class loop *at_loop, tree offset,
4995                           tree *initial_address, gimple_stmt_iterator *gsi,
4996                           gimple **ptr_incr, bool only_init,
4997                           tree iv_step)
4998 {
4999   const char *base_name;
5000   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5001   class loop *loop = NULL;
5002   bool nested_in_vect_loop = false;
5003   class loop *containing_loop = NULL;
5004   tree aggr_ptr_type;
5005   tree aggr_ptr;
5006   tree new_temp;
5007   gimple_seq new_stmt_list = NULL;
5008   edge pe = NULL;
5009   basic_block new_bb;
5010   tree aggr_ptr_init;
5011   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5012   struct data_reference *dr = dr_info->dr;
5013   tree aptr;
5014   gimple_stmt_iterator incr_gsi;
5015   bool insert_after;
5016   tree indx_before_incr, indx_after_incr;
5017   gimple *incr;
5018   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
5019
5020   gcc_assert (iv_step != NULL_TREE
5021               || TREE_CODE (aggr_type) == ARRAY_TYPE
5022               || TREE_CODE (aggr_type) == VECTOR_TYPE);
5023
5024   if (loop_vinfo)
5025     {
5026       loop = LOOP_VINFO_LOOP (loop_vinfo);
5027       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5028       containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5029       pe = loop_preheader_edge (loop);
5030     }
5031   else
5032     {
5033       gcc_assert (bb_vinfo);
5034       only_init = true;
5035       *ptr_incr = NULL;
5036     }
5037
5038   /* Create an expression for the first address accessed by this load
5039      in LOOP.  */
5040   base_name = get_name (DR_BASE_ADDRESS (dr));
5041
5042   if (dump_enabled_p ())
5043     {
5044       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
5045       dump_printf_loc (MSG_NOTE, vect_location,
5046                        "create %s-pointer variable to type: %T",
5047                        get_tree_code_name (TREE_CODE (aggr_type)),
5048                        aggr_type);
5049       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
5050         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
5051       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
5052         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
5053       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
5054         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
5055       else
5056         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
5057       dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
5058     }
5059
5060   /* (1) Create the new aggregate-pointer variable.
5061      Vector and array types inherit the alias set of their component
5062      type by default so we need to use a ref-all pointer if the data
5063      reference does not conflict with the created aggregated data
5064      reference because it is not addressable.  */
5065   bool need_ref_all = false;
5066   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5067                               get_alias_set (DR_REF (dr))))
5068     need_ref_all = true;
5069   /* Likewise for any of the data references in the stmt group.  */
5070   else if (DR_GROUP_SIZE (stmt_info) > 1)
5071     {
5072       stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5073       do
5074         {
5075           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5076           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5077                                       get_alias_set (DR_REF (sdr))))
5078             {
5079               need_ref_all = true;
5080               break;
5081             }
5082           sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5083         }
5084       while (sinfo);
5085     }
5086   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5087                                                need_ref_all);
5088   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5089
5090
5091   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5092      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5093      def-use update cycles for the pointer: one relative to the outer-loop
5094      (LOOP), which is what steps (3) and (4) below do.  The other is relative
5095      to the inner-loop (which is the inner-most loop containing the dataref),
5096      and this is done be step (5) below.
5097
5098      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5099      inner-most loop, and so steps (3),(4) work the same, and step (5) is
5100      redundant.  Steps (3),(4) create the following:
5101
5102         vp0 = &base_addr;
5103         LOOP:   vp1 = phi(vp0,vp2)
5104                 ...
5105                 ...
5106                 vp2 = vp1 + step
5107                 goto LOOP
5108
5109      If there is an inner-loop nested in loop, then step (5) will also be
5110      applied, and an additional update in the inner-loop will be created:
5111
5112         vp0 = &base_addr;
5113         LOOP:   vp1 = phi(vp0,vp2)
5114                 ...
5115         inner:     vp3 = phi(vp1,vp4)
5116                    vp4 = vp3 + inner_step
5117                    if () goto inner
5118                 ...
5119                 vp2 = vp1 + step
5120                 if () goto LOOP   */
5121
5122   /* (2) Calculate the initial address of the aggregate-pointer, and set
5123      the aggregate-pointer to point to it before the loop.  */
5124
5125   /* Create: (&(base[init_val]+offset) in the loop preheader.  */
5126
5127   new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5128                                                    stmt_info, &new_stmt_list,
5129                                                    offset);
5130   if (new_stmt_list)
5131     {
5132       if (pe)
5133         {
5134           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5135           gcc_assert (!new_bb);
5136         }
5137       else
5138         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5139     }
5140
5141   *initial_address = new_temp;
5142   aggr_ptr_init = new_temp;
5143
5144   /* (3) Handle the updating of the aggregate-pointer inside the loop.
5145      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5146      inner-loop nested in LOOP (during outer-loop vectorization).  */
5147
5148   /* No update in loop is required.  */
5149   if (only_init && (!loop_vinfo || at_loop == loop))
5150     aptr = aggr_ptr_init;
5151   else
5152     {
5153       /* Accesses to invariant addresses should be handled specially
5154          by the caller.  */
5155       tree step = vect_dr_behavior (vinfo, dr_info)->step;
5156       gcc_assert (!integer_zerop (step));
5157
5158       if (iv_step == NULL_TREE)
5159         {
5160           /* The step of the aggregate pointer is the type size,
5161              negated for downward accesses.  */
5162           iv_step = TYPE_SIZE_UNIT (aggr_type);
5163           if (tree_int_cst_sgn (step) == -1)
5164             iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5165         }
5166
5167       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5168
5169       create_iv (aggr_ptr_init, PLUS_EXPR,
5170                  fold_convert (aggr_ptr_type, iv_step),
5171                  aggr_ptr, loop, &incr_gsi, insert_after,
5172                  &indx_before_incr, &indx_after_incr);
5173       incr = gsi_stmt (incr_gsi);
5174
5175       /* Copy the points-to information if it exists. */
5176       if (DR_PTR_INFO (dr))
5177         {
5178           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5179           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5180         }
5181       if (ptr_incr)
5182         *ptr_incr = incr;
5183
5184       aptr = indx_before_incr;
5185     }
5186
5187   if (!nested_in_vect_loop || only_init)
5188     return aptr;
5189
5190
5191   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5192      nested in LOOP, if exists.  */
5193
5194   gcc_assert (nested_in_vect_loop);
5195   if (!only_init)
5196     {
5197       standard_iv_increment_position (containing_loop, &incr_gsi,
5198                                       &insert_after);
5199       create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
5200                  aggr_ptr, containing_loop, &incr_gsi, insert_after,
5201                  &indx_before_incr, &indx_after_incr);
5202       incr = gsi_stmt (incr_gsi);
5203
5204       /* Copy the points-to information if it exists. */
5205       if (DR_PTR_INFO (dr))
5206         {
5207           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5208           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5209         }
5210       if (ptr_incr)
5211         *ptr_incr = incr;
5212
5213       return indx_before_incr;
5214     }
5215   else
5216     gcc_unreachable ();
5217 }
5218
5219
5220 /* Function bump_vector_ptr
5221
5222    Increment a pointer (to a vector type) by vector-size. If requested,
5223    i.e. if PTR-INCR is given, then also connect the new increment stmt
5224    to the existing def-use update-chain of the pointer, by modifying
5225    the PTR_INCR as illustrated below:
5226
5227    The pointer def-use update-chain before this function:
5228                         DATAREF_PTR = phi (p_0, p_2)
5229                         ....
5230         PTR_INCR:       p_2 = DATAREF_PTR + step
5231
5232    The pointer def-use update-chain after this function:
5233                         DATAREF_PTR = phi (p_0, p_2)
5234                         ....
5235                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5236                         ....
5237         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
5238
5239    Input:
5240    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5241                  in the loop.
5242    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5243               the loop.  The increment amount across iterations is expected
5244               to be vector_size.
5245    BSI - location where the new update stmt is to be placed.
5246    STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5247    BUMP - optional. The offset by which to bump the pointer. If not given,
5248           the offset is assumed to be vector_size.
5249
5250    Output: Return NEW_DATAREF_PTR as illustrated above.
5251
5252 */
5253
5254 tree
5255 bump_vector_ptr (vec_info *vinfo,
5256                  tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5257                  stmt_vec_info stmt_info, tree bump)
5258 {
5259   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5260   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5261   tree update = TYPE_SIZE_UNIT (vectype);
5262   gimple *incr_stmt;
5263   ssa_op_iter iter;
5264   use_operand_p use_p;
5265   tree new_dataref_ptr;
5266
5267   if (bump)
5268     update = bump;
5269
5270   if (TREE_CODE (dataref_ptr) == SSA_NAME)
5271     new_dataref_ptr = copy_ssa_name (dataref_ptr);
5272   else if (is_gimple_min_invariant (dataref_ptr))
5273     /* When possible avoid emitting a separate increment stmt that will
5274        force the addressed object addressable.  */
5275     return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
5276                    fold_build2 (MEM_REF,
5277                                 TREE_TYPE (TREE_TYPE (dataref_ptr)),
5278                                 dataref_ptr,
5279                                 fold_convert (ptr_type_node, update)));
5280   else
5281     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5282   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5283                                    dataref_ptr, update);
5284   vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5285   /* Fold the increment, avoiding excessive chains use-def chains of
5286      those, leading to compile-time issues for passes until the next
5287      forwprop pass which would do this as well.  */
5288   gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5289   if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5290     {
5291       incr_stmt = gsi_stmt (fold_gsi);
5292       update_stmt (incr_stmt);
5293     }
5294
5295   /* Copy the points-to information if it exists. */
5296   if (DR_PTR_INFO (dr))
5297     {
5298       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5299       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5300     }
5301
5302   if (!ptr_incr)
5303     return new_dataref_ptr;
5304
5305   /* Update the vector-pointer's cross-iteration increment.  */
5306   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5307     {
5308       tree use = USE_FROM_PTR (use_p);
5309
5310       if (use == dataref_ptr)
5311         SET_USE (use_p, new_dataref_ptr);
5312       else
5313         gcc_assert (operand_equal_p (use, update, 0));
5314     }
5315
5316   return new_dataref_ptr;
5317 }
5318
5319
5320 /* Copy memory reference info such as base/clique from the SRC reference
5321    to the DEST MEM_REF.  */
5322
5323 void
5324 vect_copy_ref_info (tree dest, tree src)
5325 {
5326   if (TREE_CODE (dest) != MEM_REF)
5327     return;
5328
5329   tree src_base = src;
5330   while (handled_component_p (src_base))
5331     src_base = TREE_OPERAND (src_base, 0);
5332   if (TREE_CODE (src_base) != MEM_REF
5333       && TREE_CODE (src_base) != TARGET_MEM_REF)
5334     return;
5335
5336   MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5337   MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5338 }
5339
5340
5341 /* Function vect_create_destination_var.
5342
5343    Create a new temporary of type VECTYPE.  */
5344
5345 tree
5346 vect_create_destination_var (tree scalar_dest, tree vectype)
5347 {
5348   tree vec_dest;
5349   const char *name;
5350   char *new_name;
5351   tree type;
5352   enum vect_var_kind kind;
5353
5354   kind = vectype
5355     ? VECTOR_BOOLEAN_TYPE_P (vectype)
5356     ? vect_mask_var
5357     : vect_simple_var
5358     : vect_scalar_var;
5359   type = vectype ? vectype : TREE_TYPE (scalar_dest);
5360
5361   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5362
5363   name = get_name (scalar_dest);
5364   if (name)
5365     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5366   else
5367     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5368   vec_dest = vect_get_new_vect_var (type, kind, new_name);
5369   free (new_name);
5370
5371   return vec_dest;
5372 }
5373
5374 /* Function vect_grouped_store_supported.
5375
5376    Returns TRUE if interleave high and interleave low permutations
5377    are supported, and FALSE otherwise.  */
5378
5379 bool
5380 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5381 {
5382   machine_mode mode = TYPE_MODE (vectype);
5383
5384   /* vect_permute_store_chain requires the group size to be equal to 3 or
5385      be a power of two.  */
5386   if (count != 3 && exact_log2 (count) == -1)
5387     {
5388       if (dump_enabled_p ())
5389         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5390                          "the size of the group of accesses"
5391                          " is not a power of 2 or not eqaul to 3\n");
5392       return false;
5393     }
5394
5395   /* Check that the permutation is supported.  */
5396   if (VECTOR_MODE_P (mode))
5397     {
5398       unsigned int i;
5399       if (count == 3)
5400         {
5401           unsigned int j0 = 0, j1 = 0, j2 = 0;
5402           unsigned int i, j;
5403
5404           unsigned int nelt;
5405           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5406             {
5407               if (dump_enabled_p ())
5408                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5409                                  "cannot handle groups of 3 stores for"
5410                                  " variable-length vectors\n");
5411               return false;
5412             }
5413
5414           vec_perm_builder sel (nelt, nelt, 1);
5415           sel.quick_grow (nelt);
5416           vec_perm_indices indices;
5417           for (j = 0; j < 3; j++)
5418             {
5419               int nelt0 = ((3 - j) * nelt) % 3;
5420               int nelt1 = ((3 - j) * nelt + 1) % 3;
5421               int nelt2 = ((3 - j) * nelt + 2) % 3;
5422               for (i = 0; i < nelt; i++)
5423                 {
5424                   if (3 * i + nelt0 < nelt)
5425                     sel[3 * i + nelt0] = j0++;
5426                   if (3 * i + nelt1 < nelt)
5427                     sel[3 * i + nelt1] = nelt + j1++;
5428                   if (3 * i + nelt2 < nelt)
5429                     sel[3 * i + nelt2] = 0;
5430                 }
5431               indices.new_vector (sel, 2, nelt);
5432               if (!can_vec_perm_const_p (mode, mode, indices))
5433                 {
5434                   if (dump_enabled_p ())
5435                     dump_printf (MSG_MISSED_OPTIMIZATION,
5436                                  "permutation op not supported by target.\n");
5437                   return false;
5438                 }
5439
5440               for (i = 0; i < nelt; i++)
5441                 {
5442                   if (3 * i + nelt0 < nelt)
5443                     sel[3 * i + nelt0] = 3 * i + nelt0;
5444                   if (3 * i + nelt1 < nelt)
5445                     sel[3 * i + nelt1] = 3 * i + nelt1;
5446                   if (3 * i + nelt2 < nelt)
5447                     sel[3 * i + nelt2] = nelt + j2++;
5448                 }
5449               indices.new_vector (sel, 2, nelt);
5450               if (!can_vec_perm_const_p (mode, mode, indices))
5451                 {
5452                   if (dump_enabled_p ())
5453                     dump_printf (MSG_MISSED_OPTIMIZATION,
5454                                  "permutation op not supported by target.\n");
5455                   return false;
5456                 }
5457             }
5458           return true;
5459         }
5460       else
5461         {
5462           /* If length is not equal to 3 then only power of 2 is supported.  */
5463           gcc_assert (pow2p_hwi (count));
5464           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5465
5466           /* The encoding has 2 interleaved stepped patterns.  */
5467           if(!multiple_p (nelt, 2))
5468             return false;
5469           vec_perm_builder sel (nelt, 2, 3);
5470           sel.quick_grow (6);
5471           for (i = 0; i < 3; i++)
5472             {
5473               sel[i * 2] = i;
5474               sel[i * 2 + 1] = i + nelt;
5475             }
5476           vec_perm_indices indices (sel, 2, nelt);
5477           if (can_vec_perm_const_p (mode, mode, indices))
5478             {
5479               for (i = 0; i < 6; i++)
5480                 sel[i] += exact_div (nelt, 2);
5481               indices.new_vector (sel, 2, nelt);
5482               if (can_vec_perm_const_p (mode, mode, indices))
5483                 return true;
5484             }
5485         }
5486     }
5487
5488   if (dump_enabled_p ())
5489     dump_printf (MSG_MISSED_OPTIMIZATION,
5490                  "permutation op not supported by target.\n");
5491   return false;
5492 }
5493
5494 /* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors
5495    of type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5496
5497 internal_fn
5498 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5499                             bool masked_p)
5500 {
5501   if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
5502                                     vec_mask_len_store_lanes_optab, vectype,
5503                                     count))
5504     return IFN_MASK_LEN_STORE_LANES;
5505   else if (masked_p)
5506     {
5507       if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5508                                         vec_mask_store_lanes_optab, vectype,
5509                                         count))
5510         return IFN_MASK_STORE_LANES;
5511     }
5512   else
5513     {
5514       if (vect_lanes_optab_supported_p ("vec_store_lanes",
5515                                         vec_store_lanes_optab, vectype, count))
5516         return IFN_STORE_LANES;
5517     }
5518   return IFN_LAST;
5519 }
5520
5521
5522 /* Function vect_permute_store_chain.
5523
5524    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5525    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5526    the data correctly for the stores.  Return the final references for stores
5527    in RESULT_CHAIN.
5528
5529    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5530    The input is 4 vectors each containing 8 elements.  We assign a number to
5531    each element, the input sequence is:
5532
5533    1st vec:   0  1  2  3  4  5  6  7
5534    2nd vec:   8  9 10 11 12 13 14 15
5535    3rd vec:  16 17 18 19 20 21 22 23
5536    4th vec:  24 25 26 27 28 29 30 31
5537
5538    The output sequence should be:
5539
5540    1st vec:  0  8 16 24  1  9 17 25
5541    2nd vec:  2 10 18 26  3 11 19 27
5542    3rd vec:  4 12 20 28  5 13 21 29
5543    4th vec:  6 14 22 30  7 15 23 31
5544
5545    i.e., we interleave the contents of the four vectors in their order.
5546
5547    We use interleave_high/low instructions to create such output.  The input of
5548    each interleave_high/low operation is two vectors:
5549    1st vec    2nd vec
5550    0 1 2 3    4 5 6 7
5551    the even elements of the result vector are obtained left-to-right from the
5552    high/low elements of the first vector.  The odd elements of the result are
5553    obtained left-to-right from the high/low elements of the second vector.
5554    The output of interleave_high will be:   0 4 1 5
5555    and of interleave_low:                   2 6 3 7
5556
5557
5558    The permutation is done in log LENGTH stages.  In each stage interleave_high
5559    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5560    where the first argument is taken from the first half of DR_CHAIN and the
5561    second argument from its second half.
5562    In our example,
5563
5564    I1: interleave_high (1st vec, 3rd vec)
5565    I2: interleave_low (1st vec, 3rd vec)
5566    I3: interleave_high (2nd vec, 4th vec)
5567    I4: interleave_low (2nd vec, 4th vec)
5568
5569    The output for the first stage is:
5570
5571    I1:  0 16  1 17  2 18  3 19
5572    I2:  4 20  5 21  6 22  7 23
5573    I3:  8 24  9 25 10 26 11 27
5574    I4: 12 28 13 29 14 30 15 31
5575
5576    The output of the second stage, i.e. the final result is:
5577
5578    I1:  0  8 16 24  1  9 17 25
5579    I2:  2 10 18 26  3 11 19 27
5580    I3:  4 12 20 28  5 13 21 29
5581    I4:  6 14 22 30  7 15 23 31.  */
5582
5583 void
5584 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5585                           unsigned int length,
5586                           stmt_vec_info stmt_info,
5587                           gimple_stmt_iterator *gsi,
5588                           vec<tree> *result_chain)
5589 {
       /* VINFO and GSI are only forwarded to vect_finish_stmt_generation for
          each permute stmt emitted here; STMT_INFO additionally supplies the
          vector type.  */
5590   tree vect1, vect2, high, low;
5591   gimple *perm_stmt;
5592   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5593   tree perm_mask_low, perm_mask_high;
5594   tree data_ref;
5595   tree perm3_mask_low, perm3_mask_high;
       /* LOG_LENGTH is only used on the power-of-two path below.  */
5596   unsigned int i, j, n, log_length = exact_log2 (length);
5597
       /* Seed RESULT_CHAIN with a copy of the first LENGTH entries of
          DR_CHAIN.  quick_grow is used, so presumably the caller has already
          reserved the space -- TODO confirm.  */
5598   result_chain->quick_grow (length);
5599   memcpy (result_chain->address (), dr_chain.address (),
5600           length * sizeof (tree));
5601
5602   if (length == 3)
5603     {
5604       /* vect_grouped_store_supported ensures that this is constant.  */
5605       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
           /* Running element indices into the first, second and third input
              vector; they keep counting across the three rounds below.  */
5606       unsigned int j0 = 0, j1 = 0, j2 = 0;
5607
5608       vec_perm_builder sel (nelt, nelt, 1);
5609       sel.quick_grow (nelt);
5610       vec_perm_indices indices;
5611       for (j = 0; j < 3; j++)
5612         {
               /* Residues modulo 3 of the result positions that receive
                  elements of the first, second and third input vector.  */
5613           int nelt0 = ((3 - j) * nelt) % 3;
5614           int nelt1 = ((3 - j) * nelt + 1) % 3;
5615           int nelt2 = ((3 - j) * nelt + 2) % 3;
5616
               /* Mask for the first permute: merge the first two vectors.  The
                  slots meant for the third vector get a placeholder index 0.  */
5617           for (i = 0; i < nelt; i++)
5618             {
5619               if (3 * i + nelt0 < nelt)
5620                 sel[3 * i + nelt0] = j0++;
5621               if (3 * i + nelt1 < nelt)
5622                 sel[3 * i + nelt1] = nelt + j1++;
5623               if (3 * i + nelt2 < nelt)
5624                 sel[3 * i + nelt2] = 0;
5625             }
5626           indices.new_vector (sel, 2, nelt);
5627           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5628
               /* Mask for the second permute: keep the merged slots in place and
                  fill the placeholder slots from the second operand.  */
5629           for (i = 0; i < nelt; i++)
5630             {
5631               if (3 * i + nelt0 < nelt)
5632                 sel[3 * i + nelt0] = 3 * i + nelt0;
5633               if (3 * i + nelt1 < nelt)
5634                 sel[3 * i + nelt1] = 3 * i + nelt1;
5635               if (3 * i + nelt2 < nelt)
5636                 sel[3 * i + nelt2] = nelt + j2++;
5637             }
5638           indices.new_vector (sel, 2, nelt);
5639           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5640
               /* Every round combines the same three input vectors; only the
                  masks differ with J.  */
5641           vect1 = dr_chain[0];
5642           vect2 = dr_chain[1];
5643
5644           /* Create interleaving stmt:
5645              low = VEC_PERM_EXPR <vect1, vect2,
5646                                   {j, nelt, *, j + 1, nelt + j + 1, *,
5647                                    j + 2, nelt + j + 2, *, ...}>  */
5648           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5649           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5650                                            vect2, perm3_mask_low);
5651           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5652
5653           vect1 = data_ref;
5654           vect2 = dr_chain[2];
5655           /* Create interleaving stmt:
5656              high = VEC_PERM_EXPR <low, vect2,
5657                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
5658                                    6, 7, nelt + j + 2, ...}>  */
5659           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5660           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5661                                            vect2, perm3_mask_high);
5662           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5663           (*result_chain)[j] = data_ref;
5664         }
5665     }
5666   else
5667     {
5668       /* If length is not equal to 3 then only power of 2 is supported.  */
5669       gcc_assert (pow2p_hwi (length));
5670
5671       /* The encoding has 2 interleaved stepped patterns.  */
5672       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5673       vec_perm_builder sel (nelt, 2, 3);
5674       sel.quick_grow (6);
           /* Mask {0, nelt, 1, nelt + 1, 2, nelt + 2, ...}.  */
5675       for (i = 0; i < 3; i++)
5676         {
5677           sel[i * 2] = i;
5678           sel[i * 2 + 1] = i + nelt;
5679         }
5680         vec_perm_indices indices (sel, 2, nelt);
5681         perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5682
             /* Same pattern, shifted by half a vector.  */
5683         for (i = 0; i < 6; i++)
5684           sel[i] += exact_div (nelt, 2);
5685         indices.new_vector (sel, 2, nelt);
5686         perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5687
             /* Each of the log2 (LENGTH) stages pairs entry J of the first
                half of DR_CHAIN with entry J of the second half.  */
5688         for (i = 0, n = log_length; i < n; i++)
5689           {
5690             for (j = 0; j < length/2; j++)
5691               {
5692                 vect1 = dr_chain[j];
5693                 vect2 = dr_chain[j+length/2];
5694
5695                 /* Create interleaving stmt:
5696                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5697                                                         ...}>  */
5698                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5699                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5700                                                  vect2, perm_mask_high);
5701                 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5702                 (*result_chain)[2*j] = high;
5703
5704                 /* Create interleaving stmt:
5705                    low = VEC_PERM_EXPR <vect1, vect2,
5706                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5707                                          ...}>  */
5708                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5709                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5710                                                  vect2, perm_mask_low);
5711                 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5712                 (*result_chain)[2*j+1] = low;
5713               }
                 /* The output of this stage becomes the input of the next one;
                    note that this overwrites the caller's DR_CHAIN.  */
5714             memcpy (dr_chain.address (), result_chain->address (),
5715                     length * sizeof (tree));
5716           }
5717     }
5718 }
5719
5720 /* Function vect_setup_realignment
5721
5722    This function is called when vectorizing an unaligned load using
5723    the dr_explicit_realign[_optimized] scheme.
5724    This function generates the following code at the loop prolog:
5725
5726       p = initial_addr;
5727    x  msq_init = *(floor(p));   # prolog load
5728       realignment_token = call target_builtin;
5729     loop:
5730    x  msq = phi (msq_init, ---)
5731
5732    The stmts marked with x are generated only for the case of
5733    dr_explicit_realign_optimized.
5734
5735    The code above sets up a new (vector) pointer, pointing to the first
5736    location accessed by STMT_INFO, and a "floor-aligned" load using that
5737    pointer.  It also generates code to compute the "realignment-token"
5738    (if the relevant target hook was defined), and creates a phi-node at the
5739    loop-header bb whose arguments are the result of the prolog-load (created
5740    by this function) and the result of a load that takes place in the loop
5741    (to be created by the caller to this function).
5742
5743    For the case of dr_explicit_realign_optimized:
5744    The caller to this function uses the phi-result (msq) to create the
5745    realignment code inside the loop, and sets up the missing phi argument,
5746    as follows:
5747     loop:
5748       msq = phi (msq_init, lsq)
5749       lsq = *(floor(p'));        # load in loop
5750       result = realign_load (msq, lsq, realignment_token);
5751
5752    For the case of dr_explicit_realign:
5753     loop:
5754       msq = *(floor(p));        # load in loop
5755       p' = p + (VS-1);
5756       lsq = *(floor(p'));       # load in loop
5757       result = realign_load (msq, lsq, realignment_token);
5758
5759    Input:
5760    STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5761                a memory location that may be unaligned.
5762    GSI - place where new code is to be inserted.
5763    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5764                               is used.
5765
5766    Output:
5767    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5768                        target hook, if defined.
5769    Return value - the result of the loop-header phi node.  */
5770
5771 tree
5772 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
5773                         gimple_stmt_iterator *gsi, tree *realignment_token,
5774                         enum dr_alignment_support alignment_support_scheme,
5775                         tree init_addr,
5776                         class loop **at_loop)
5777 {
       /* Parameters not covered by the function comment:
          VINFO - vectorization info; loop-specific handling is done only when
                  it is a loop_vec_info.
          INIT_ADDR - if not NULL_TREE, the initial address already computed by
                      the caller; this selects the COMPUTE_IN_LOOP mode.
          AT_LOOP - if non-null, receives the loop chosen for the initial
                    vector load (NULL when VINFO is not a loop_vec_info).  */
5778   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5779   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5780   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5781   struct data_reference *dr = dr_info->dr;
5782   class loop *loop = NULL;
5783   edge pe = NULL;
5784   tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5785   tree vec_dest;
5786   gimple *inc;
5787   tree ptr;
5788   tree data_ref;
5789   basic_block new_bb;
5790   tree msq_init = NULL_TREE;
5791   tree new_temp;
5792   gphi *phi_stmt;
5793   tree msq = NULL_TREE;
5794   gimple_seq stmts = NULL;
5795   bool compute_in_loop = false;
5796   bool nested_in_vect_loop = false;
5797   class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5798   class loop *loop_for_initial_load = NULL;
5799
5800   if (loop_vinfo)
5801     {
5802       loop = LOOP_VINFO_LOOP (loop_vinfo);
5803       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5804     }
5805
5806   gcc_assert (alignment_support_scheme == dr_explicit_realign
5807               || alignment_support_scheme == dr_explicit_realign_optimized);
5808
5809   /* We need to generate three things:
5810      1. the misalignment computation
5811      2. the extra vector load (for the optimized realignment scheme).
5812      3. the phi node for the two vectors from which the realignment is
5813       done (for the optimized realignment scheme).  */
5814
5815   /* 1. Determine where to generate the misalignment computation.
5816
5817      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5818      calculation will be generated by this function, outside the loop (in the
5819      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5820      caller, inside the loop.
5821
5822      Background: If the misalignment remains fixed throughout the iterations of
5823      the loop, then both realignment schemes are applicable, and also the
5824      misalignment computation can be done outside LOOP.  This is because we are
5825      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5826      are a multiple of VS (the Vector Size), and therefore the misalignment in
5827      different vectorized LOOP iterations is always the same.
5828      The problem arises only if the memory access is in an inner-loop nested
5829      inside LOOP, which is now being vectorized using outer-loop vectorization.
5830      This is the only case when the misalignment of the memory access may not
5831      remain fixed throughout the iterations of the inner-loop (as explained in
5832      detail in vect_supportable_dr_alignment).  In this case, not only is the
5833      optimized realignment scheme not applicable, but also the misalignment
5834      computation (and generation of the realignment token that is passed to
5835      REALIGN_LOAD) have to be done inside the loop.
5836
5837      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5838      or not, which in turn determines if the misalignment is computed inside
5839      the inner-loop, or outside LOOP.  */
5840
       /* Without a loop_vec_info there is no loop preheader to hoist into, so
          that case is handled like COMPUTE_IN_LOOP as well.  */
5841   if (init_addr != NULL_TREE || !loop_vinfo)
5842     {
5843       compute_in_loop = true;
5844       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5845     }
5846
5847
5848   /* 2. Determine where to generate the extra vector load.
5849
5850      For the optimized realignment scheme, instead of generating two vector
5851      loads in each iteration, we generate a single extra vector load in the
5852      preheader of the loop, and in each iteration reuse the result of the
5853      vector load from the previous iteration.  In case the memory access is in
5854      an inner-loop nested inside LOOP, which is now being vectorized using
5855      outer-loop vectorization, we need to determine whether this initial vector
5856      load should be generated at the preheader of the inner-loop, or can be
5857      generated at the preheader of LOOP.  If the memory access has no evolution
5858      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5859      to be generated inside LOOP (in the preheader of the inner-loop).  */
5860
5861   if (nested_in_vect_loop)
5862     {
5863       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5864       bool invariant_in_outerloop =
5865             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5866       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5867     }
5868   else
5869     loop_for_initial_load = loop;
5870   if (at_loop)
5871     *at_loop = loop_for_initial_load;
5872
       /* Pick the virtual operand for the initial load: the value live on the
          preheader edge when the chosen loop's header has a virtual PHI,
          otherwise the vuse of the stmt at GSI.  */
5873   tree vuse = NULL_TREE;
5874   if (loop_for_initial_load)
5875     {
5876       pe = loop_preheader_edge (loop_for_initial_load);
5877       if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
5878         vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
5879     }
5880   if (!vuse)
5881     vuse = gimple_vuse (gsi_stmt (*gsi));
5882
5883   /* 3. For the case of the optimized realignment, create the first vector
5884       load at the loop preheader.  */
5885
5886   if (alignment_support_scheme == dr_explicit_realign_optimized)
5887     {
5888       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5889       gassign *new_stmt;
5890
5891       gcc_assert (!compute_in_loop);
5892       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5893       ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
5894                                       loop_for_initial_load, NULL_TREE,
5895                                       &init_addr, NULL, &inc, true);
5896       if (TREE_CODE (ptr) == SSA_NAME)
5897         new_temp = copy_ssa_name (ptr);
5898       else
5899         new_temp = make_ssa_name (TREE_TYPE (ptr));
           /* floor (ptr): mask PTR with (0 - ALIGN), where ALIGN is the target
              alignment of the data reference.  */
5900       poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5901       tree type = TREE_TYPE (ptr);
5902       new_stmt = gimple_build_assign
5903                    (new_temp, BIT_AND_EXPR, ptr,
5904                     fold_build2 (MINUS_EXPR, type,
5905                                  build_int_cst (type, 0),
5906                                  build_int_cst (type, align)));
5907       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5908       gcc_assert (!new_bb);
5909       data_ref
5910         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5911                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5912       vect_copy_ref_info (data_ref, DR_REF (dr));
5913       new_stmt = gimple_build_assign (vec_dest, data_ref);
5914       new_temp = make_ssa_name (vec_dest, new_stmt);
5915       gimple_assign_set_lhs (new_stmt, new_temp);
5916       gimple_set_vuse (new_stmt, vuse);
           /* NOTE(review): PE has already been used unconditionally for the
              address computation a few lines up, so the else arm below looks
              unreachable in this scheme -- confirm before relying on it.  */
5917       if (pe)
5918         {
5919           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5920           gcc_assert (!new_bb);
5921         }
5922       else
5923          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5924
5925       msq_init = gimple_assign_lhs (new_stmt);
5926     }
5927
5928   /* 4. Create realignment token using a target builtin, if available.
5929       It is done either inside the containing loop, or before LOOP (as
5930       determined above).  */
5931
       /* When the target does not provide the hook, *REALIGNMENT_TOKEN is
          left untouched by this function.  */
5932   if (targetm.vectorize.builtin_mask_for_load)
5933     {
5934       gcall *new_stmt;
5935       tree builtin_decl;
5936
5937       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5938       if (!init_addr)
5939         {
5940           /* Generate the INIT_ADDR computation outside LOOP.  */
5941           init_addr = vect_create_addr_base_for_vector_ref (vinfo,
5942                                                             stmt_info, &stmts,
5943                                                             NULL_TREE);
5944           if (loop)
5945             {
5946               pe = loop_preheader_edge (loop);
5947               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5948               gcc_assert (!new_bb);
5949             }
5950           else
5951              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5952         }
5953
5954       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5955       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5956       vec_dest =
5957         vect_create_destination_var (scalar_dest,
5958                                      gimple_call_return_type (new_stmt));
5959       new_temp = make_ssa_name (vec_dest, new_stmt);
5960       gimple_call_set_lhs (new_stmt, new_temp);
5961
5962       if (compute_in_loop)
5963         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5964       else
5965         {
5966           /* Generate the misalignment computation outside LOOP.  */
5967           pe = loop_preheader_edge (loop);
5968           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5969           gcc_assert (!new_bb);
5970         }
5971
5972       *realignment_token = gimple_call_lhs (new_stmt);
5973
5974       /* The result of the CALL_EXPR to this builtin is determined from
5975          the value of the parameter and no global variables are touched
5976          which makes the builtin a "const" function.  Requiring the
5977          builtin to have the "const" attribute makes it unnecessary
5978          to call mark_call_clobbered.  */
5979       gcc_assert (TREE_READONLY (builtin_decl));
5980     }
5981
       /* For dr_explicit_realign no PHI is created; MSQ is still NULL_TREE
          at this point, so that is what the caller gets back.  */
5982   if (alignment_support_scheme == dr_explicit_realign)
5983     return msq;
5984
5985   gcc_assert (!compute_in_loop);
5986   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5987
5988
5989   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5990
       /* Only the preheader argument is added here; the other PHI argument
          (lsq) is not set by this function.  */
5991   pe = loop_preheader_edge (containing_loop);
5992   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5993   msq = make_ssa_name (vec_dest);
5994   phi_stmt = create_phi_node (msq, containing_loop->header);
5995   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5996
5997   return msq;
5998 }
5999
6000
6001 /* Function vect_grouped_load_supported.
6002
6003    COUNT is the size of the load group (the number of statements plus the
6004    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
6005    only one statement, with a gap of COUNT - 1.
6006
6007    Returns true if a suitable permute exists.  */
6008
6009 bool
6010 vect_grouped_load_supported (tree vectype, bool single_element_p,
6011                              unsigned HOST_WIDE_INT count)
6012 {
6013   machine_mode mode = TYPE_MODE (vectype);
6014
6015   /* If this is single-element interleaving with an element distance
6016      that leaves unused vector loads around punt - we at least create
6017      very sub-optimal code in that case (and blow up memory,
6018      see PR65518).  */
6019   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
6020     {
6021       if (dump_enabled_p ())
6022         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6023                          "single-element interleaving not supported "
6024                          "for not adjacent vector loads\n");
6025       return false;
6026     }
6027
6028   /* vect_permute_load_chain requires the group size to be equal to 3 or
6029      be a power of two.  */
6030   if (count != 3 && exact_log2 (count) == -1)
6031     {
6032       if (dump_enabled_p ())
6033         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6034                          "the size of the group of accesses"
6035                          " is not a power of 2 or not equal to 3\n");
6036       return false;
6037     }
6038
6039   /* Check that the permutation is supported.  */
6040   if (VECTOR_MODE_P (mode))
6041     {
6042       unsigned int i, j;
6043       if (count == 3)
6044         {
6045           unsigned int nelt;
6046           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
6047             {
6048               if (dump_enabled_p ())
6049                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6050                                  "cannot handle groups of 3 loads for"
6051                                  " variable-length vectors\n");
6052               return false;
6053             }
6054
6055           vec_perm_builder sel (nelt, nelt, 1);
6056           sel.quick_grow (nelt);
6057           vec_perm_indices indices;
6058           unsigned int k;
6059           for (k = 0; k < 3; k++)
6060             {
6061               for (i = 0; i < nelt; i++)
6062                 if (3 * i + k < 2 * nelt)
6063                   sel[i] = 3 * i + k;
6064                 else
6065                   sel[i] = 0;
6066               indices.new_vector (sel, 2, nelt);
6067               if (!can_vec_perm_const_p (mode, mode, indices))
6068                 {
6069                   if (dump_enabled_p ())
6070                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6071                                      "shuffle of 3 loads is not supported by"
6072                                      " target\n");
6073                   return false;
6074                 }
6075               for (i = 0, j = 0; i < nelt; i++)
6076                 if (3 * i + k < 2 * nelt)
6077                   sel[i] = i;
6078                 else
6079                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6080               indices.new_vector (sel, 2, nelt);
6081               if (!can_vec_perm_const_p (mode, mode, indices))
6082                 {
6083                   if (dump_enabled_p ())
6084                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6085                                      "shuffle of 3 loads is not supported by"
6086                                      " target\n");
6087                   return false;
6088                 }
6089             }
6090           return true;
6091         }
6092       else
6093         {
6094           /* If length is not equal to 3 then only power of 2 is supported.  */
6095           gcc_assert (pow2p_hwi (count));
6096           poly_uint64 nelt = GET_MODE_NUNITS (mode);
6097
6098           /* The encoding has a single stepped pattern.  */
6099           vec_perm_builder sel (nelt, 1, 3);
6100           sel.quick_grow (3);
6101           for (i = 0; i < 3; i++)
6102             sel[i] = i * 2;
6103           vec_perm_indices indices (sel, 2, nelt);
6104           if (can_vec_perm_const_p (mode, mode, indices))
6105             {
6106               for (i = 0; i < 3; i++)
6107                 sel[i] = i * 2 + 1;
6108               indices.new_vector (sel, 2, nelt);
6109               if (can_vec_perm_const_p (mode, mode, indices))
6110                 return true;
6111             }
6112         }
6113     }
6114
6115   if (dump_enabled_p ())
6116     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6117                      "extract even/odd not supported by target\n");
6118   return false;
6119 }
6120
6121 /* Return FN if vec_{masked_,mask_len_}load_lanes is available for COUNT vectors
6122    of type VECTYPE.  MASKED_P says whether the masked form is needed.  */
6123
6124 internal_fn
6125 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6126                            bool masked_p)
6127 {
6128   if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
6129                                     vec_mask_len_load_lanes_optab, vectype,
6130                                     count))
6131     return IFN_MASK_LEN_LOAD_LANES;
6132   else if (masked_p)
6133     {
6134       if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6135                                         vec_mask_load_lanes_optab, vectype,
6136                                         count))
6137         return IFN_MASK_LOAD_LANES;
6138     }
6139   else
6140     {
6141       if (vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
6142                                         vectype, count))
6143         return IFN_LOAD_LANES;
6144     }
6145   return IFN_LAST;
6146 }
6147
6148 /* Function vect_permute_load_chain.
6149
6150    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6151    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6152    the input data correctly.  Return the final references for loads in
6153    RESULT_CHAIN.
6154
6155    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6156    The input is 4 vectors each containing 8 elements. We assign a number to each
6157    element, the input sequence is:
6158
6159    1st vec:   0  1  2  3  4  5  6  7
6160    2nd vec:   8  9 10 11 12 13 14 15
6161    3rd vec:  16 17 18 19 20 21 22 23
6162    4th vec:  24 25 26 27 28 29 30 31
6163
6164    The output sequence should be:
6165
6166    1st vec:  0 4  8 12 16 20 24 28
6167    2nd vec:  1 5  9 13 17 21 25 29
6168    3rd vec:  2 6 10 14 18 22 26 30
6169    4th vec:  3 7 11 15 19 23 27 31
6170
6171    i.e., the first output vector should contain the first elements of each
6172    interleaving group, etc.
6173
6174    We use extract_even/odd instructions to create such output.  The input of
6175    each extract_even/odd operation is two vectors
6176    1st vec    2nd vec
6177    0 1 2 3    4 5 6 7
6178
6179    and the output is the vector of extracted even/odd elements.  The output of
6180    extract_even will be:   0 2 4 6
6181    and of extract_odd:     1 3 5 7
6182
6183
6184    The permutation is done in log LENGTH stages.  In each stage extract_even
6185    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6186    their order.  In our example,
6187
6188    E1: extract_even (1st vec, 2nd vec)
6189    E2: extract_odd (1st vec, 2nd vec)
6190    E3: extract_even (3rd vec, 4th vec)
6191    E4: extract_odd (3rd vec, 4th vec)
6192
6193    The output for the first stage will be:
6194
6195    E1:  0  2  4  6  8 10 12 14
6196    E2:  1  3  5  7  9 11 13 15
6197    E3: 16 18 20 22 24 26 28 30
6198    E4: 17 19 21 23 25 27 29 31
6199
6200    In order to proceed and create the correct sequence for the next stage (or
6201    for the correct output, if the second stage is the last one, as in our
6202    example), we first put the output of extract_even operation and then the
6203    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6204    The input for the second stage is:
6205
6206    1st vec (E1):  0  2  4  6  8 10 12 14
6207    2nd vec (E3): 16 18 20 22 24 26 28 30
6208    3rd vec (E2):  1  3  5  7  9 11 13 15
6209    4th vec (E4): 17 19 21 23 25 27 29 31
6210
6211    The output of the second stage:
6212
6213    E1: 0 4  8 12 16 20 24 28
6214    E2: 2 6 10 14 18 22 26 30
6215    E3: 1 5  9 13 17 21 25 29
6216    E4: 3 7 11 15 19 23 27 31
6217
6218    And RESULT_CHAIN after reordering:
6219
6220    1st vec (E1):  0 4  8 12 16 20 24 28
6221    2nd vec (E3):  1 5  9 13 17 21 25 29
6222    3rd vec (E2):  2 6 10 14 18 22 26 30
6223    4th vec (E4):  3 7 11 15 19 23 27 31.  */
6224
6225 static void
6226 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6227                          unsigned int length,
6228                          stmt_vec_info stmt_info,
6229                          gimple_stmt_iterator *gsi,
6230                          vec<tree> *result_chain)
6231 {
6232   tree data_ref, first_vect, second_vect;
6233   tree perm_mask_even, perm_mask_odd;
6234   tree perm3_mask_low, perm3_mask_high;
6235   gimple *perm_stmt;
6236   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6237   unsigned int i, j, log_length = exact_log2 (length);
6238
6239   result_chain->quick_grow (length);
6240   memcpy (result_chain->address (), dr_chain.address (),
6241           length * sizeof (tree));
6242
6243   if (length == 3)
6244     {
6245       /* vect_grouped_load_supported ensures that this is constant.  */
6246       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6247       unsigned int k;
6248
6249       vec_perm_builder sel (nelt, nelt, 1);
6250       sel.quick_grow (nelt);
6251       vec_perm_indices indices;
6252       for (k = 0; k < 3; k++)
6253         {
6254           for (i = 0; i < nelt; i++)
6255             if (3 * i + k < 2 * nelt)
6256               sel[i] = 3 * i + k;
6257             else
6258               sel[i] = 0;
6259           indices.new_vector (sel, 2, nelt);
6260           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6261
6262           for (i = 0, j = 0; i < nelt; i++)
6263             if (3 * i + k < 2 * nelt)
6264               sel[i] = i;
6265             else
6266               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6267           indices.new_vector (sel, 2, nelt);
6268           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6269
6270           first_vect = dr_chain[0];
6271           second_vect = dr_chain[1];
6272
6273           /* Create interleaving stmt (low part of):
6274              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6275                                                              ...}>  */
6276           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6277           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6278                                            second_vect, perm3_mask_low);
6279           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6280
6281           /* Create interleaving stmt (high part of):
6282              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6283                                                               ...}>  */
6284           first_vect = data_ref;
6285           second_vect = dr_chain[2];
6286           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6287           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6288                                            second_vect, perm3_mask_high);
6289           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6290           (*result_chain)[k] = data_ref;
6291         }
6292     }
6293   else
6294     {
6295       /* If length is not equal to 3 then only power of 2 is supported.  */
6296       gcc_assert (pow2p_hwi (length));
6297
6298       /* The encoding has a single stepped pattern.  */
6299       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6300       vec_perm_builder sel (nelt, 1, 3);
6301       sel.quick_grow (3);
6302       for (i = 0; i < 3; ++i)
6303         sel[i] = i * 2;
6304       vec_perm_indices indices (sel, 2, nelt);
6305       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6306
6307       for (i = 0; i < 3; ++i)
6308         sel[i] = i * 2 + 1;
6309       indices.new_vector (sel, 2, nelt);
6310       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6311
6312       for (i = 0; i < log_length; i++)
6313         {
6314           for (j = 0; j < length; j += 2)
6315             {
6316               first_vect = dr_chain[j];
6317               second_vect = dr_chain[j+1];
6318
6319               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
6320               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6321               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6322                                                first_vect, second_vect,
6323                                                perm_mask_even);
6324               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6325               (*result_chain)[j/2] = data_ref;
6326
6327               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
6328               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6329               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6330                                                first_vect, second_vect,
6331                                                perm_mask_odd);
6332               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6333               (*result_chain)[j/2+length/2] = data_ref;
6334             }
6335           memcpy (dr_chain.address (), result_chain->address (),
6336                   length * sizeof (tree));
6337         }
6338     }
6339 }
6340
6341 /* Function vect_shift_permute_load_chain.
6342
   Given a chain of loads in DR_CHAIN of LENGTH that is a power of 2 or
   equal to 3, generate a sequence of stmts to reorder the input data
   accordingly.  Return the final references for loads in RESULT_CHAIN.
   Return true on success, false otherwise.
6347
6348    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6349    The input is 3 vectors each containing 8 elements.  We assign a
6350    number to each element, the input sequence is:
6351
6352    1st vec:   0  1  2  3  4  5  6  7
6353    2nd vec:   8  9 10 11 12 13 14 15
6354    3rd vec:  16 17 18 19 20 21 22 23
6355
6356    The output sequence should be:
6357
6358    1st vec:  0 3 6  9 12 15 18 21
6359    2nd vec:  1 4 7 10 13 16 19 22
6360    3rd vec:  2 5 8 11 14 17 20 23
6361
6362    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6363
6364    First we shuffle all 3 vectors to get correct elements order:
6365
6366    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
6367    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
6368    3rd vec:  (16 19 22) (17 20 23) (18 21)
6369
6370    Next we unite and shift vector 3 times:
6371
6372    1st step:
6373      shift right by 6 the concatenation of:
6374      "1st vec" and  "2nd vec"
6375        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6376      "2nd vec" and  "3rd vec"
6377        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6378      "3rd vec" and  "1st vec"
6379        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6380                              | New vectors                   |
6381
6382      So that now new vectors are:
6383
6384      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6385      2nd vec:  (10 13) (16 19 22) (17 20 23)
6386      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6387
6388    2nd step:
6389      shift right by 5 the concatenation of:
6390      "1st vec" and  "3rd vec"
6391        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6392      "2nd vec" and  "1st vec"
6393        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6394      "3rd vec" and  "2nd vec"
6395        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6396                           | New vectors                   |
6397
6398      So that now new vectors are:
6399
6400      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6401      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6402      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6403
6404    3rd step:
6405      shift right by 5 the concatenation of:
6406      "1st vec" and  "1st vec"
6407        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6408      shift right by 3 the concatenation of:
6409      "2nd vec" and  "2nd vec"
6410                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6411                           | New vectors                   |
6412
6413      So that now all vectors are READY:
6414      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6415      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6416      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6417
6418    This algorithm is faster than one in vect_permute_load_chain if:
     1.  "shift of a concatenation" is faster than general permutation.
6420          This is usually so.
6421      2.  The TARGET machine can't execute vector instructions in parallel.
6422          This is because each step of the algorithm depends on previous.
6423          The algorithm in vect_permute_load_chain is much more parallel.
6424
6425    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6426 */
6427
6428 static bool
6429 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6430                                unsigned int length,
6431                                stmt_vec_info stmt_info,
6432                                gimple_stmt_iterator *gsi,
6433                                vec<tree> *result_chain)
6434 {
6435   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6436   tree perm2_mask1, perm2_mask2, perm3_mask;
6437   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6438   gimple *perm_stmt;
6439
6440   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6441   machine_mode vmode = TYPE_MODE (vectype);
6442   unsigned int i;
6443   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6444
6445   unsigned HOST_WIDE_INT nelt, vf;
6446   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6447       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6448     /* Not supported for variable-length vectors.  */
6449     return false;
6450
6451   vec_perm_builder sel (nelt, nelt, 1);
6452   sel.quick_grow (nelt);
6453
6454   result_chain->quick_grow (length);
6455   memcpy (result_chain->address (), dr_chain.address (),
6456           length * sizeof (tree));
6457
6458   if (pow2p_hwi (length) && vf > 4)
6459     {
6460       unsigned int j, log_length = exact_log2 (length);
6461       for (i = 0; i < nelt / 2; ++i)
6462         sel[i] = i * 2;
6463       for (i = 0; i < nelt / 2; ++i)
6464         sel[nelt / 2 + i] = i * 2 + 1;
6465       vec_perm_indices indices (sel, 2, nelt);
6466       if (!can_vec_perm_const_p (vmode, vmode, indices))
6467         {
6468           if (dump_enabled_p ())
6469             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6470                              "shuffle of 2 fields structure is not \
6471                               supported by target\n");
6472           return false;
6473         }
6474       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6475
6476       for (i = 0; i < nelt / 2; ++i)
6477         sel[i] = i * 2 + 1;
6478       for (i = 0; i < nelt / 2; ++i)
6479         sel[nelt / 2 + i] = i * 2;
6480       indices.new_vector (sel, 2, nelt);
6481       if (!can_vec_perm_const_p (vmode, vmode, indices))
6482         {
6483           if (dump_enabled_p ())
6484             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6485                              "shuffle of 2 fields structure is not \
6486                               supported by target\n");
6487           return false;
6488         }
6489       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6490
6491       /* Generating permutation constant to shift all elements.
6492          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6493       for (i = 0; i < nelt; i++)
6494         sel[i] = nelt / 2 + i;
6495       indices.new_vector (sel, 2, nelt);
6496       if (!can_vec_perm_const_p (vmode, vmode, indices))
6497         {
6498           if (dump_enabled_p ())
6499             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6500                              "shift permutation is not supported by target\n");
6501           return false;
6502         }
6503       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6504
6505       /* Generating permutation constant to select vector from 2.
6506          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6507       for (i = 0; i < nelt / 2; i++)
6508         sel[i] = i;
6509       for (i = nelt / 2; i < nelt; i++)
6510         sel[i] = nelt + i;
6511       indices.new_vector (sel, 2, nelt);
6512       if (!can_vec_perm_const_p (vmode, vmode, indices))
6513         {
6514           if (dump_enabled_p ())
6515             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6516                              "select is not supported by target\n");
6517           return false;
6518         }
6519       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6520
6521       for (i = 0; i < log_length; i++)
6522         {
6523           for (j = 0; j < length; j += 2)
6524             {
6525               first_vect = dr_chain[j];
6526               second_vect = dr_chain[j + 1];
6527
6528               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6529               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6530                                                first_vect, first_vect,
6531                                                perm2_mask1);
6532               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6533               vect[0] = data_ref;
6534
6535               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6536               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6537                                                second_vect, second_vect,
6538                                                perm2_mask2);
6539               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6540               vect[1] = data_ref;
6541
6542               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6543               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6544                                                vect[0], vect[1], shift1_mask);
6545               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6546               (*result_chain)[j/2 + length/2] = data_ref;
6547
6548               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6549               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6550                                                vect[0], vect[1], select_mask);
6551               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6552               (*result_chain)[j/2] = data_ref;
6553             }
6554           memcpy (dr_chain.address (), result_chain->address (),
6555                   length * sizeof (tree));
6556         }
6557       return true;
6558     }
6559   if (length == 3 && vf > 2)
6560     {
6561       unsigned int k = 0, l = 0;
6562
6563       /* Generating permutation constant to get all elements in rigth order.
6564          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6565       for (i = 0; i < nelt; i++)
6566         {
6567           if (3 * k + (l % 3) >= nelt)
6568             {
6569               k = 0;
6570               l += (3 - (nelt % 3));
6571             }
6572           sel[i] = 3 * k + (l % 3);
6573           k++;
6574         }
6575       vec_perm_indices indices (sel, 2, nelt);
6576       if (!can_vec_perm_const_p (vmode, vmode, indices))
6577         {
6578           if (dump_enabled_p ())
6579             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6580                              "shuffle of 3 fields structure is not \
6581                               supported by target\n");
6582           return false;
6583         }
6584       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6585
6586       /* Generating permutation constant to shift all elements.
6587          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6588       for (i = 0; i < nelt; i++)
6589         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6590       indices.new_vector (sel, 2, nelt);
6591       if (!can_vec_perm_const_p (vmode, vmode, indices))
6592         {
6593           if (dump_enabled_p ())
6594             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6595                              "shift permutation is not supported by target\n");
6596           return false;
6597         }
6598       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6599
6600       /* Generating permutation constant to shift all elements.
6601          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6602       for (i = 0; i < nelt; i++)
6603         sel[i] = 2 * (nelt / 3) + 1 + i;
6604       indices.new_vector (sel, 2, nelt);
6605       if (!can_vec_perm_const_p (vmode, vmode, indices))
6606         {
6607           if (dump_enabled_p ())
6608             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6609                              "shift permutation is not supported by target\n");
6610           return false;
6611         }
6612       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6613
6614       /* Generating permutation constant to shift all elements.
6615          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6616       for (i = 0; i < nelt; i++)
6617         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6618       indices.new_vector (sel, 2, nelt);
6619       if (!can_vec_perm_const_p (vmode, vmode, indices))
6620         {
6621           if (dump_enabled_p ())
6622             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6623                              "shift permutation is not supported by target\n");
6624           return false;
6625         }
6626       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6627
6628       /* Generating permutation constant to shift all elements.
6629          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6630       for (i = 0; i < nelt; i++)
6631         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6632       indices.new_vector (sel, 2, nelt);
6633       if (!can_vec_perm_const_p (vmode, vmode, indices))
6634         {
6635           if (dump_enabled_p ())
6636             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6637                              "shift permutation is not supported by target\n");
6638           return false;
6639         }
6640       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6641
6642       for (k = 0; k < 3; k++)
6643         {
6644           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6645           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6646                                            dr_chain[k], dr_chain[k],
6647                                            perm3_mask);
6648           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6649           vect[k] = data_ref;
6650         }
6651
6652       for (k = 0; k < 3; k++)
6653         {
6654           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6655           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6656                                            vect[k % 3], vect[(k + 1) % 3],
6657                                            shift1_mask);
6658           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6659           vect_shift[k] = data_ref;
6660         }
6661
6662       for (k = 0; k < 3; k++)
6663         {
6664           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6665           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6666                                            vect_shift[(4 - k) % 3],
6667                                            vect_shift[(3 - k) % 3],
6668                                            shift2_mask);
6669           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6670           vect[k] = data_ref;
6671         }
6672
6673       (*result_chain)[3 - (nelt % 3)] = vect[2];
6674
6675       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6676       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6677                                        vect[0], shift3_mask);
6678       vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6679       (*result_chain)[nelt % 3] = data_ref;
6680
6681       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6682       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6683                                        vect[1], shift4_mask);
6684       vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6685       (*result_chain)[0] = data_ref;
6686       return true;
6687     }
6688   return false;
6689 }
6690
6691 /* Function vect_transform_grouped_load.
6692
6693    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6694    to perform their permutation and ascribe the result vectorized statements to
6695    the scalar statements.
6696 */
6697
6698 void
6699 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6700                              vec<tree> dr_chain,
6701                              int size, gimple_stmt_iterator *gsi)
6702 {
6703   machine_mode mode;
6704   vec<tree> result_chain = vNULL;
6705
6706   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6707      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6708      vectors, that are ready for vector computation.  */
6709   result_chain.create (size);
6710
6711   /* If reassociation width for vector type is 2 or greater target machine can
6712      execute 2 or more vector instructions in parallel.  Otherwise try to
6713      get chain for loads group using vect_shift_permute_load_chain.  */
6714   mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6715   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6716       || pow2p_hwi (size)
6717       || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6718                                          gsi, &result_chain))
6719     vect_permute_load_chain (vinfo, dr_chain,
6720                              size, stmt_info, gsi, &result_chain);
6721   vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6722   result_chain.release ();
6723 }
6724
6725 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6726    generated as part of the vectorization of STMT_INFO.  Assign the statement
6727    for each vector to the associated scalar statement.  */
6728
6729 void
6730 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6731                                   vec<tree> result_chain)
6732 {
6733   stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6734   unsigned int i, gap_count;
6735   tree tmp_data_ref;
6736
6737   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6738      Since we scan the chain starting from it's first node, their order
6739      corresponds the order of data-refs in RESULT_CHAIN.  */
6740   stmt_vec_info next_stmt_info = first_stmt_info;
6741   gap_count = 1;
6742   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6743     {
6744       if (!next_stmt_info)
6745         break;
6746
6747       /* Skip the gaps.  Loads created for the gaps will be removed by dead
6748        code elimination pass later.  No need to check for the first stmt in
6749        the group, since it always exists.
6750        DR_GROUP_GAP is the number of steps in elements from the previous
6751        access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
6752        correspond to the gaps.  */
6753       if (next_stmt_info != first_stmt_info
6754           && gap_count < DR_GROUP_GAP (next_stmt_info))
6755         {
6756           gap_count++;
6757           continue;
6758         }
6759
6760       /* ???  The following needs cleanup after the removal of
6761          DR_GROUP_SAME_DR_STMT.  */
6762       if (next_stmt_info)
6763         {
6764           gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6765           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6766              copies, and we put the new vector statement last.  */
6767           STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
6768
6769           next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6770           gap_count = 1;
6771         }
6772     }
6773 }
6774
6775 /* Function vect_force_dr_alignment_p.
6776
6777    Returns whether the alignment of a DECL can be forced to be aligned
6778    on ALIGNMENT bit boundary.  */
6779
6780 bool
6781 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6782 {
6783   if (!VAR_P (decl))
6784     return false;
6785
6786   if (decl_in_symtab_p (decl)
6787       && !symtab_node::get (decl)->can_increase_alignment_p ())
6788     return false;
6789
6790   if (TREE_STATIC (decl))
6791     return (known_le (alignment,
6792                       (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6793   else
6794     return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6795 }
6796
6797 /* Return whether the data reference DR_INFO is supported with respect to its
6798    alignment.
6799    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6800    it is aligned, i.e., check if it is possible to vectorize it with different
6801    alignment.  */
6802
6803 enum dr_alignment_support
6804 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
6805                                tree vectype, int misalignment)
6806 {
6807   data_reference *dr = dr_info->dr;
6808   stmt_vec_info stmt_info = dr_info->stmt;
6809   machine_mode mode = TYPE_MODE (vectype);
6810   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6811   class loop *vect_loop = NULL;
6812   bool nested_in_vect_loop = false;
6813
6814   if (misalignment == 0)
6815     return dr_aligned;
6816
6817   /* For now assume all conditional loads/stores support unaligned
6818      access without any special code.  */
6819   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6820     if (gimple_call_internal_p (stmt)
6821         && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6822             || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6823       return dr_unaligned_supported;
6824
6825   if (loop_vinfo)
6826     {
6827       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6828       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6829     }
6830
6831   /* Possibly unaligned access.  */
6832
6833   /* We can choose between using the implicit realignment scheme (generating
6834      a misaligned_move stmt) and the explicit realignment scheme (generating
6835      aligned loads with a REALIGN_LOAD).  There are two variants to the
6836      explicit realignment scheme: optimized, and unoptimized.
6837      We can optimize the realignment only if the step between consecutive
6838      vector loads is equal to the vector size.  Since the vector memory
6839      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6840      is guaranteed that the misalignment amount remains the same throughout the
6841      execution of the vectorized loop.  Therefore, we can create the
6842      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6843      at the loop preheader.
6844
6845      However, in the case of outer-loop vectorization, when vectorizing a
6846      memory access in the inner-loop nested within the LOOP that is now being
6847      vectorized, while it is guaranteed that the misalignment of the
6848      vectorized memory access will remain the same in different outer-loop
6849      iterations, it is *not* guaranteed that is will remain the same throughout
6850      the execution of the inner-loop.  This is because the inner-loop advances
6851      with the original scalar step (and not in steps of VS).  If the inner-loop
6852      step happens to be a multiple of VS, then the misalignment remains fixed
6853      and we can use the optimized realignment scheme.  For example:
6854
6855       for (i=0; i<N; i++)
6856         for (j=0; j<M; j++)
6857           s += a[i+j];
6858
6859      When vectorizing the i-loop in the above example, the step between
6860      consecutive vector loads is 1, and so the misalignment does not remain
6861      fixed across the execution of the inner-loop, and the realignment cannot
6862      be optimized (as illustrated in the following pseudo vectorized loop):
6863
6864       for (i=0; i<N; i+=4)
6865         for (j=0; j<M; j++){
6866           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6867                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6868                          // (assuming that we start from an aligned address).
6869           }
6870
6871      We therefore have to use the unoptimized realignment scheme:
6872
6873       for (i=0; i<N; i+=4)
6874           for (j=k; j<M; j+=4)
6875           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6876                            // that the misalignment of the initial address is
6877                            // 0).
6878
6879      The loop can then be vectorized as follows:
6880
6881       for (k=0; k<4; k++){
6882         rt = get_realignment_token (&vp[k]);
6883         for (i=0; i<N; i+=4){
6884           v1 = vp[i+k];
6885           for (j=k; j<M; j+=4){
6886             v2 = vp[i+j+VS-1];
6887             va = REALIGN_LOAD <v1,v2,rt>;
6888             vs += va;
6889             v1 = v2;
6890           }
6891         }
6892     } */
6893
6894   if (DR_IS_READ (dr))
6895     {
6896       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6897           && (!targetm.vectorize.builtin_mask_for_load
6898               || targetm.vectorize.builtin_mask_for_load ()))
6899         {
6900           /* If we are doing SLP then the accesses need not have the
6901              same alignment, instead it depends on the SLP group size.  */
6902           if (loop_vinfo
6903               && STMT_SLP_TYPE (stmt_info)
6904               && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
6905                   || !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6906                                   * (DR_GROUP_SIZE
6907                                        (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6908                                   TYPE_VECTOR_SUBPARTS (vectype))))
6909             ;
6910           else if (!loop_vinfo
6911                    || (nested_in_vect_loop
6912                        && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6913                                     GET_MODE_SIZE (TYPE_MODE (vectype)))))
6914             return dr_explicit_realign;
6915           else
6916             return dr_explicit_realign_optimized;
6917         }
6918     }
6919
6920   bool is_packed = false;
6921   tree type = TREE_TYPE (DR_REF (dr));
6922   if (misalignment == DR_MISALIGNMENT_UNKNOWN)
6923     is_packed = not_size_aligned (DR_REF (dr));
6924   if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
6925                                                      is_packed))
6926     return dr_unaligned_supported;
6927
6928   /* Unsupported.  */
6929   return dr_unaligned_unsupported;
6930 }