gcc/tree-vect-data-refs.cc
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "tree-cfg.h"
53 #include "tree-hash-traits.h"
54 #include "vec-perm-indices.h"
55 #include "internal-fn.h"
56 #include "gimple-fold.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
97 return true;
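
/* Illustrative sketch (not compiled): a loop with a two-element interleaved
   load group.  On targets that implement the vec_load_lanes optab for the
   chosen vector mode, vect_lanes_optab_supported_p above is what lets the
   vectorizer use a lanes load for such a group.  The function name below is
   made up for illustration.  */
#if 0
void
example_interleaved_sum (int *__restrict out, const int *__restrict in, int n)
{
  for (int i = 0; i < n; i++)
    /* in[2*i] and in[2*i+1] form a load group of size 2.  */
    out[i] = in[2 * i] + in[2 * i + 1];
}
#endif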
100 /* Helper function to identify a simd clone call. If this is a call to a
101 function with simd clones then return the corresponding cgraph_node,
102 otherwise return NULL. */
104 static cgraph_node*
105 simd_clone_call_p (gimple *stmt)
107 gcall *call = dyn_cast <gcall *> (stmt);
108 if (!call)
109 return NULL;
111 tree fndecl = NULL_TREE;
112 if (gimple_call_internal_p (call, IFN_MASK_CALL))
113 fndecl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
114 else
115 fndecl = gimple_call_fndecl (stmt);
117 if (fndecl == NULL_TREE)
118 return NULL;
120 cgraph_node *node = cgraph_node::get (fndecl);
121 if (node && node->simd_clones != NULL)
122 return node;
124 return NULL;
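
/* Illustrative sketch (not compiled): a call that simd_clone_call_p would
   recognize.  Declaring FOO with "omp declare simd" makes the compiler emit
   simd clones for it; when the call sits under a condition it can end up
   wrapped in IFN_MASK_CALL, which is why the function above also looks
   through that internal call.  The identifiers are made up.  */
#if 0
#pragma omp declare simd
extern int foo (int);

void
example_simd_clone_use (int *a, const int *b, int n)
{
  #pragma omp simd
  for (int i = 0; i < n; i++)
    if (b[i] > 0)
      a[i] = foo (b[i]);
}
#endif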
129 /* Return the smallest scalar part of STMT_INFO.
130 This is used to determine the vectype of the stmt. We generally set the
131 vectype according to the type of the result (lhs). For stmts whose
132 result-type is different than the type of the arguments (e.g., demotion,
133 promotion), vectype will be reset appropriately (later). Note that we have
134 to visit the smallest datatype in this function, because that determines the
135 VF. If the smallest datatype in the loop is present only as the rhs of a
136 promotion operation - we'd miss it.
137 Such a case, where a variable of this datatype does not appear in the lhs
138 anywhere in the loop, can only occur if it's an invariant: e.g.:
139 'int_x = (int) short_inv', which we'd expect to have been optimized away by
140 invariant motion. However, we cannot rely on invariant motion to always
141 take invariants out of the loop, and so in the case of promotion we also
142 have to check the rhs.
143 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
144 types. */
146 tree
147 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
149 HOST_WIDE_INT lhs, rhs;
151 /* During the analysis phase, this function is called on arbitrary
152 statements that might not have scalar results. */
153 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
154 return scalar_type;
156 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
158 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
159 if (assign)
161 scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
162 if (gimple_assign_cast_p (assign)
163 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
164 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
165 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
166 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
167 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
169 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
171 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
172 if (rhs < lhs)
173 scalar_type = rhs_type;
176 else if (cgraph_node *node = simd_clone_call_p (stmt_info->stmt))
178 auto clone = node->simd_clones->simdclone;
179 for (unsigned int i = 0; i < clone->nargs; ++i)
181 if (clone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
183 tree arg_scalar_type = TREE_TYPE (clone->args[i].vector_type);
184 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (arg_scalar_type));
185 if (rhs < lhs)
187 scalar_type = arg_scalar_type;
188 lhs = rhs;
193 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
195 unsigned int i = 0;
196 if (gimple_call_internal_p (call))
198 internal_fn ifn = gimple_call_internal_fn (call);
199 if (internal_load_fn_p (ifn))
200 /* For loads the LHS type does the trick. */
201 i = ~0U;
202 else if (internal_store_fn_p (ifn))
204 /* For stores use the type of the stored value. */
205 i = internal_fn_stored_value_index (ifn);
206 scalar_type = TREE_TYPE (gimple_call_arg (call, i));
207 i = ~0U;
209 else if (internal_fn_mask_index (ifn) == 0)
210 i = 1;
212 if (i < gimple_call_num_args (call))
214 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
215 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
217 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
218 if (rhs < lhs)
219 scalar_type = rhs_type;
224 return scalar_type;
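
/* Illustrative sketch (not compiled): the narrowest type below (short) only
   appears on the RHS of a widening conversion, yet it determines the
   vectorization factor, which is why the function above inspects the RHS of
   promotions as well as the LHS.  Names are made up.  */
#if 0
int
example_widening_sum (const short *b, int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++)
    sum += b[i];   /* short widened to int; smallest scalar type is short.  */
  return sum;
}
#endif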
228 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
229 tested at run-time. Return TRUE if DDR was successfully inserted.
230 Return false if versioning is not supported. */
232 static opt_result
233 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
235 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
237 if ((unsigned) param_vect_max_version_for_alias_checks == 0)
238 return opt_result::failure_at (vect_location,
239 "will not create alias checks, as"
240 " --param vect-max-version-for-alias-checks"
241 " == 0\n");
243 opt_result res
244 = runtime_alias_check_p (ddr, loop,
245 optimize_loop_nest_for_speed_p (loop));
246 if (!res)
247 return res;
249 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
250 return opt_result::success ();
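
/* Illustrative sketch (not compiled): with no information about whether A
   and B overlap, the ddr between the two references is recorded via
   vect_mark_for_runtime_alias_test and the loop gets versioned with a
   runtime overlap check, unless --param vect-max-version-for-alias-checks
   is 0.  Names are made up.  */
#if 0
void
example_runtime_alias (int *a, const int *b, int n)
{
  for (int i = 0; i < n; i++)
    a[i] = b[i] + 1;   /* a and b may alias; needs a runtime check.  */
}
#endif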
253 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
255 static void
256 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
258 const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
259 for (unsigned int i = 0; i < checks.length(); ++i)
260 if (checks[i] == value)
261 return;
263 if (dump_enabled_p ())
264 dump_printf_loc (MSG_NOTE, vect_location,
265 "need run-time check that %T is nonzero\n",
266 value);
267 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
270 /* Return true if we know that the order of vectorized DR_INFO_A and
271 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
272 DR_INFO_B. At least one of the accesses is a write. */
274 static bool
275 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
277 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
278 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
280 /* Single statements are always kept in their original order. */
281 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
282 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
283 return true;
285 /* STMT_A and STMT_B belong to overlapping groups. All loads are
286 emitted at the position of the first scalar load.
287 Stores in a group are emitted at the position of the last scalar store.
288 Compute that position and check whether the resulting order matches
289 the current one. */
290 stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
291 if (il_a)
293 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
294 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
295 s = DR_GROUP_NEXT_ELEMENT (s))
296 il_a = get_later_stmt (il_a, s);
297 else /* DR_IS_READ */
298 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
299 s = DR_GROUP_NEXT_ELEMENT (s))
300 if (get_later_stmt (il_a, s) == il_a)
301 il_a = s;
303 else
304 il_a = stmtinfo_a;
305 stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
306 if (il_b)
308 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
309 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
310 s = DR_GROUP_NEXT_ELEMENT (s))
311 il_b = get_later_stmt (il_b, s);
312 else /* DR_IS_READ */
313 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
314 s = DR_GROUP_NEXT_ELEMENT (s))
315 if (get_later_stmt (il_b, s) == il_b)
316 il_b = s;
318 else
319 il_b = stmtinfo_b;
320 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
321 return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
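
/* Illustrative sketch (not compiled): grouped accesses to the same array
   where the vector load is emitted at the position of the first scalar load
   of the group and the vector store at the last scalar store.  The check
   above verifies that this repositioning does not swap a load with a store
   it originally preceded or followed.  Names are made up.  */
#if 0
void
example_grouped_order (int *a, int n)
{
  for (int i = 0; i < n - 1; i += 2)
    {
      int x = a[i];       /* group load  { a[i], a[i+1] } emitted here.  */
      int y = a[i + 1];
      a[i] = y;           /* group store { a[i], a[i+1] } emitted at the  */
      a[i + 1] = x;       /* last scalar store of the group, i.e. here.  */
    }
}
#endif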
324 /* A subroutine of vect_analyze_data_ref_dependence. Handle
325 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
326 distances. These distances are conservatively correct but they don't
327 reflect a guaranteed dependence.
329 Return true if this function does all the work necessary to avoid
330 an alias or false if the caller should use the dependence distances
331 to limit the vectorization factor in the usual way. LOOP_DEPTH is
332 the depth of the loop described by LOOP_VINFO and the other arguments
333 are as for vect_analyze_data_ref_dependence. */
335 static bool
336 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
337 loop_vec_info loop_vinfo,
338 int loop_depth, unsigned int *max_vf)
340 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
341 for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
343 int dist = dist_v[loop_depth];
344 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
346 /* If the user asserted safelen >= DIST consecutive iterations
347 can be executed concurrently, assume independence.
349 ??? An alternative would be to add the alias check even
350 in this case, and vectorize the fallback loop with the
351 maximum VF set to safelen. However, if the user has
352 explicitly given a length, it's less likely that that
353 would be a win. */
354 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
356 if ((unsigned int) loop->safelen < *max_vf)
357 *max_vf = loop->safelen;
358 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
359 continue;
362 /* For dependence distances of 2 or more, we have the option
363 of limiting VF or checking for an alias at runtime.
364 Prefer to check at runtime if we can, to avoid limiting
365 the VF unnecessarily when the bases are in fact independent.
367 Note that the alias checks will be removed if the VF ends up
368 being small enough. */
369 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
370 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
371 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
372 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
373 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
376 return true;
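
/* Illustrative sketch (not compiled) of the kind of situation this path
   handles: A and B have the same access pattern but different base
   pointers, so distances computed as if the bases were equal are
   conservatively correct without reflecting a guaranteed dependence.  With
   safelen(8) the distance of 4 is assumed away and the VF is merely capped
   at 8; without it the vectorizer prefers a runtime alias check over
   limiting the VF to 4.  Names are made up.  */
#if 0
void
example_possibly_independent (int *a, const int *b, int n)
{
  #pragma omp simd safelen(8)
  for (int i = 0; i < n; i++)
    a[i + 4] = b[i] + 1;   /* distance 4 if a == b + 4*..., none otherwise.  */
}
#endif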
380 /* Function vect_analyze_data_ref_dependence.
382 FIXME: I needed to change the sense of the returned flag.
384 Return FALSE if there (might) exist a dependence between a memory-reference
385 DRA and a memory-reference DRB. When versioning for alias may check a
386 dependence at run-time, return TRUE. Adjust *MAX_VF according to
387 the data dependence. */
389 static opt_result
390 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
391 loop_vec_info loop_vinfo,
392 unsigned int *max_vf)
394 unsigned int i;
395 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
396 struct data_reference *dra = DDR_A (ddr);
397 struct data_reference *drb = DDR_B (ddr);
398 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
399 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
400 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
401 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
402 lambda_vector dist_v;
403 unsigned int loop_depth;
405 /* If user asserted safelen consecutive iterations can be
406 executed concurrently, assume independence. */
407 auto apply_safelen = [&]()
409 if (loop->safelen >= 2)
411 if ((unsigned int) loop->safelen < *max_vf)
412 *max_vf = loop->safelen;
413 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
414 return true;
416 return false;
419 /* In loop analysis all data references should be vectorizable. */
420 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
421 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
422 gcc_unreachable ();
424 /* Independent data accesses. */
425 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
426 return opt_result::success ();
428 if (dra == drb
429 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
430 return opt_result::success ();
432 /* We do not have to consider dependences between accesses that belong
433 to the same group, unless the stride could be smaller than the
434 group size. */
435 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
436 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
437 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
438 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
439 return opt_result::success ();
441 /* Even if we have an anti-dependence then, as the vectorized loop covers at
442 least two scalar iterations, there is always also a true dependence.
443 As the vectorizer does not re-order loads and stores we can ignore
444 the anti-dependence if TBAA can disambiguate both DRs similar to the
445 case with known negative distance anti-dependences (positive
446 distance anti-dependences would violate TBAA constraints). */
447 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
448 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
449 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
450 get_alias_set (DR_REF (drb))))
451 return opt_result::success ();
453 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
454 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
456 if (apply_safelen ())
457 return opt_result::success ();
459 return opt_result::failure_at
460 (stmtinfo_a->stmt,
461 "possible alias involving gather/scatter between %T and %T\n",
462 DR_REF (dra), DR_REF (drb));
465 /* Unknown data dependence. */
466 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
468 if (apply_safelen ())
469 return opt_result::success ();
471 if (dump_enabled_p ())
472 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
473 "versioning for alias required: "
474 "can't determine dependence between %T and %T\n",
475 DR_REF (dra), DR_REF (drb));
477 /* Add to list of ddrs that need to be tested at run-time. */
478 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
481 /* Known data dependence. */
482 if (DDR_NUM_DIST_VECTS (ddr) == 0)
484 if (apply_safelen ())
485 return opt_result::success ();
487 if (dump_enabled_p ())
488 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
489 "versioning for alias required: "
490 "bad dist vector for %T and %T\n",
491 DR_REF (dra), DR_REF (drb));
492 /* Add to list of ddrs that need to be tested at run-time. */
493 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
496 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
498 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
499 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
500 loop_depth, max_vf))
501 return opt_result::success ();
503 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
505 int dist = dist_v[loop_depth];
507 if (dump_enabled_p ())
508 dump_printf_loc (MSG_NOTE, vect_location,
509 "dependence distance = %d.\n", dist);
511 if (dist == 0)
513 if (dump_enabled_p ())
514 dump_printf_loc (MSG_NOTE, vect_location,
515 "dependence distance == 0 between %T and %T\n",
516 DR_REF (dra), DR_REF (drb));
518 /* When we perform grouped accesses and perform implicit CSE
519 by detecting equal accesses and doing disambiguation with
520 runtime alias tests like for
521 .. = a[i];
522 .. = a[i+1];
523 a[i] = ..;
524 a[i+1] = ..;
525 *p = ..;
526 .. = a[i];
527 .. = a[i+1];
528 where we will end up loading { a[i], a[i+1] } once, make
529 sure that inserting group loads before the first load and
530 stores after the last store will do the right thing.
531 Similar for groups like
532 a[i] = ...;
533 ... = a[i];
534 a[i+1] = ...;
535 where loads from the group interleave with the store. */
536 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
537 return opt_result::failure_at (stmtinfo_a->stmt,
538 "READ_WRITE dependence"
539 " in interleaving.\n");
541 if (loop->safelen < 2)
543 tree indicator = dr_zero_step_indicator (dra);
544 if (!indicator || integer_zerop (indicator))
545 return opt_result::failure_at (stmtinfo_a->stmt,
546 "access also has a zero step\n");
547 else if (TREE_CODE (indicator) != INTEGER_CST)
548 vect_check_nonzero_value (loop_vinfo, indicator);
550 continue;
553 if (dist > 0 && DDR_REVERSED_P (ddr))
555 /* If DDR_REVERSED_P the order of the data-refs in DDR was
556 reversed (to make distance vector positive), and the actual
557 distance is negative. */
558 if (dump_enabled_p ())
559 dump_printf_loc (MSG_NOTE, vect_location,
560 "dependence distance negative.\n");
561 /* When doing outer loop vectorization, we need to check if there is
562 a backward dependence at the inner loop level if the dependence
563 at the outer loop is reversed. See PR81740. */
564 if (nested_in_vect_loop_p (loop, stmtinfo_a)
565 || nested_in_vect_loop_p (loop, stmtinfo_b))
567 unsigned inner_depth = index_in_loop_nest (loop->inner->num,
568 DDR_LOOP_NEST (ddr));
569 if (dist_v[inner_depth] < 0)
570 return opt_result::failure_at (stmtinfo_a->stmt,
571 "not vectorized, dependence "
572 "between data-refs %T and %T\n",
573 DR_REF (dra), DR_REF (drb));
575 /* Record a negative dependence distance to later limit the
576 amount of stmt copying / unrolling we can perform.
577 Only need to handle read-after-write dependence. */
578 if (DR_IS_READ (drb)
579 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
580 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
581 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
582 continue;
585 unsigned int abs_dist = abs (dist);
586 if (abs_dist >= 2 && abs_dist < *max_vf)
588 /* The dependence distance requires reduction of the maximal
589 vectorization factor. */
590 *max_vf = abs_dist;
591 if (dump_enabled_p ())
592 dump_printf_loc (MSG_NOTE, vect_location,
593 "adjusting maximal vectorization factor to %i\n",
594 *max_vf);
597 if (abs_dist >= *max_vf)
599 /* Dependence distance does not create dependence, as far as
600 vectorization is concerned, in this case. */
601 if (dump_enabled_p ())
602 dump_printf_loc (MSG_NOTE, vect_location,
603 "dependence distance >= VF.\n");
604 continue;
607 return opt_result::failure_at (stmtinfo_a->stmt,
608 "not vectorized, possible dependence "
609 "between data-refs %T and %T\n",
610 DR_REF (dra), DR_REF (drb));
613 return opt_result::success ();
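
/* Illustrative sketch (not compiled): a read-after-write dependence with a
   known distance of 2.  The loop can still be vectorized, but with at most
   two elements per vector iteration, which is the *MAX_VF adjustment
   performed above.  The function name is made up.  */
#if 0
void
example_distance_two (int *a, int n)
{
  for (int i = 0; i < n - 2; i++)
    a[i + 2] = a[i] + 1;   /* dependence distance 2 limits VF to 2.  */
}
#endif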
616 /* Function vect_analyze_early_break_dependences.
618 Examine all the data references in the loop and make sure that if we have
619 multiple exits we are able to safely move stores such that they become
620 safe for vectorization. The function also determines where to move the
621 instructions to and computes what the new vUSE chain should be.
623 This works in tandem with the CFG that will be produced by
624 slpeel_tree_duplicate_loop_to_edge_cfg later on.
626 This function tries to validate whether an early break vectorization
627 is possible for the current instruction sequence. Returns true if
628 possible, otherwise false.
630 Requirements:
631 - Any memory access must be to a fixed size buffer.
632 - There must not be any loads and stores to the same object.
633 - Multiple loads are allowed as long as they don't alias.
635 NOTE:
636 This implementation is very conservative. Any overlapping loads/stores
637 that take place before the early break statement get rejected, aside from
638 WAR dependencies.
640 i.e.:
642 a[i] = 8
643 c = a[i]
644 if (b[i])
647 is not allowed, but
649 c = a[i]
650 a[i] = 8
651 if (b[i])
654 is allowed, which is the common case. */
656 static opt_result
657 vect_analyze_early_break_dependences (loop_vec_info loop_vinfo)
659 DUMP_VECT_SCOPE ("vect_analyze_early_break_dependences");
661 /* List of all load data references found during traversal. */
662 auto_vec<data_reference *> bases;
663 basic_block dest_bb = NULL;
665 hash_set <gimple *> visited;
666 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
667 class loop *loop_nest = loop_outer (loop);
669 if (dump_enabled_p ())
670 dump_printf_loc (MSG_NOTE, vect_location,
671 "loop contains multiple exits, analyzing"
672 " statement dependencies.\n");
674 /* Since we don't support general control flow, the location we'll move the
675 side-effects to is always the latch-connected exit. When we support
676 general control flow we can do better but for now this is fine. */
677 dest_bb = single_pred (loop->latch);
678 basic_block bb = dest_bb;
682 /* If the destination block is also the header then we have nothing to do. */
683 if (!single_pred_p (bb))
684 continue;
686 bb = single_pred (bb);
687 gimple_stmt_iterator gsi = gsi_last_bb (bb);
689 /* Now analyze all the remaining statements and try to determine which
690 instructions are allowed/needed to be moved. */
691 while (!gsi_end_p (gsi))
693 gimple *stmt = gsi_stmt (gsi);
694 gsi_prev (&gsi);
695 if (!gimple_has_ops (stmt)
696 || is_gimple_debug (stmt))
697 continue;
699 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (stmt);
700 auto dr_ref = STMT_VINFO_DATA_REF (stmt_vinfo);
701 if (!dr_ref)
702 continue;
704 /* We currently only support statically allocated objects due to
705 not having first-faulting loads support or peeling for
706 alignment support. Compute the size of the referenced object
707 (it could be dynamically allocated). */
708 tree obj = DR_BASE_ADDRESS (dr_ref);
709 if (!obj || TREE_CODE (obj) != ADDR_EXPR)
711 if (dump_enabled_p ())
712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
713 "early breaks only supported on statically"
714 " allocated objects.\n");
715 return opt_result::failure_at (stmt,
716 "can't safely apply code motion to "
717 "dependencies of %G to vectorize "
718 "the early exit.\n", stmt);
721 tree refop = TREE_OPERAND (obj, 0);
722 tree refbase = get_base_address (refop);
723 if (!refbase || !DECL_P (refbase) || !DECL_SIZE (refbase)
724 || TREE_CODE (DECL_SIZE (refbase)) != INTEGER_CST)
726 if (dump_enabled_p ())
727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
728 "early breaks only supported on"
729 " statically allocated objects.\n");
730 return opt_result::failure_at (stmt,
731 "can't safely apply code motion to "
732 "dependencies of %G to vectorize "
733 "the early exit.\n", stmt);
736 /* Check if vector accesses to the object will be within bounds. The size
737 must be a constant, or else assume the loop will be versioned or niters
738 bounded by VF so accesses are within range. */
739 if (!ref_within_array_bound (stmt, DR_REF (dr_ref)))
741 if (dump_enabled_p ())
742 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
743 "early breaks not supported: vectorization "
744 "would %s beyond size of obj.",
745 DR_IS_READ (dr_ref) ? "read" : "write");
746 return opt_result::failure_at (stmt,
747 "can't safely apply code motion to "
748 "dependencies of %G to vectorize "
749 "the early exit.\n", stmt);
752 if (DR_IS_READ (dr_ref))
753 bases.safe_push (dr_ref);
754 else if (DR_IS_WRITE (dr_ref))
756 /* We are moving writes down in the CFG. To be sure that this
757 is valid after vectorization we have to check all the loads
758 we are sinking the stores past to see if any of them may
759 alias or are the same object.
761 Same objects will not be an issue because unless the store
762 is marked volatile the value can be forwarded. If the
763 store is marked volatile we don't vectorize the loop
764 anyway.
766 That leaves the check for aliasing. We don't really need
767 to care about the stores aliasing with each other since the
768 stores are moved in order so the effects are still observed
769 correctly. This leaves the check for WAR dependencies
770 which we would be introducing here if the DR can alias.
771 The check is quadratic in loads/stores but I have not found
772 a better API to do this. I believe all loads and stores
773 must be checked. We also must check them when we
774 encountered the store, since we don't care about loads past
775 the store. */
777 for (auto dr_read : bases)
778 if (dr_may_alias_p (dr_ref, dr_read, loop_nest))
780 if (dump_enabled_p ())
781 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
782 vect_location,
783 "early breaks not supported: "
784 "overlapping loads and stores "
785 "found before the break "
786 "statement.\n");
788 return opt_result::failure_at (stmt,
789 "can't safely apply code motion to dependencies"
790 " to vectorize the early exit. %G may alias with"
791 " %G\n", stmt, dr_read->stmt);
795 if (gimple_vdef (stmt))
797 if (dump_enabled_p ())
798 dump_printf_loc (MSG_NOTE, vect_location,
799 "==> recording stmt %G", stmt);
801 LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (stmt);
803 else if (gimple_vuse (stmt))
805 LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).safe_insert (0, stmt);
806 if (dump_enabled_p ())
807 dump_printf_loc (MSG_NOTE, vect_location,
808 "marked statement for vUSE update: %G", stmt);
812 while (bb != loop->header);
814 /* We don't allow outer -> inner loop transitions which should have been
815 trapped already during loop form analysis. */
816 gcc_assert (dest_bb->loop_father == loop);
818 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo) = dest_bb;
820 if (!LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).is_empty ())
822 /* All uses shall be updated to that of the first load. Entries are
823 stored in reverse order. */
824 tree vuse = gimple_vuse (LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).last ());
825 for (auto g : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
827 if (dump_enabled_p ())
828 dump_printf_loc (MSG_NOTE, vect_location,
829 "will update use: %T, mem_ref: %G", vuse, g);
833 if (dump_enabled_p ())
834 dump_printf_loc (MSG_NOTE, vect_location,
835 "recorded statements to be moved to BB %d\n",
836 LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo)->index);
838 return opt_result::success ();
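
/* Illustrative sketch (not compiled): an early-break loop of the form the
   analysis above accepts.  The load of A happens before the break and the
   store to B is sunk to the block after the exit test (dest_bb), so no
   store can be observed for iterations past the break.  Both arrays are
   statically allocated, as required.  Names are made up.  */
#if 0
#define N 1024
int a[N], b[N];

int
example_early_break (int key)
{
  int i;
  for (i = 0; i < N; i++)
    {
      if (a[i] == key)
        break;           /* early exit */
      b[i] = 2 * a[i];   /* store sunk past the exit test */
    }
  return i;
}
#endif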
841 /* Function vect_analyze_data_ref_dependences.
843 Examine all the data references in the loop, and make sure there do not
844 exist any data dependences between them. Set *MAX_VF according to
845 the maximum vectorization factor the data dependences allow. */
847 opt_result
848 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
849 unsigned int *max_vf)
851 unsigned int i;
852 struct data_dependence_relation *ddr;
854 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
856 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
858 LOOP_VINFO_DDRS (loop_vinfo)
859 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
860 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
861 /* We do not need read-read dependences. */
862 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
863 &LOOP_VINFO_DDRS (loop_vinfo),
864 LOOP_VINFO_LOOP_NEST (loop_vinfo),
865 false);
866 gcc_assert (res);
869 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
871 /* For epilogues we either have no aliases or alias versioning
872 was applied to original loop. Therefore we may just get max_vf
873 using VF of original loop. */
874 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
875 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
876 else
877 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
879 opt_result res
880 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
881 if (!res)
882 return res;
885 /* If we have early break statements in the loop, check to see if they
886 are of a form we can vectorize. */
887 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
888 return vect_analyze_early_break_dependences (loop_vinfo);
890 return opt_result::success ();
894 /* Function vect_slp_analyze_data_ref_dependence.
896 Return TRUE if there (might) exist a dependence between a memory-reference
897 DRA and a memory-reference DRB for VINFO. When versioning for alias
898 may check a dependence at run-time, return FALSE. Adjust *MAX_VF
899 according to the data dependence. */
901 static bool
902 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
903 struct data_dependence_relation *ddr)
905 struct data_reference *dra = DDR_A (ddr);
906 struct data_reference *drb = DDR_B (ddr);
907 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
908 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
910 /* We need to check dependences of statements marked as unvectorizable
911 as well, they still can prohibit vectorization. */
913 /* Independent data accesses. */
914 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
915 return false;
917 if (dra == drb)
918 return false;
920 /* Read-read is OK. */
921 if (DR_IS_READ (dra) && DR_IS_READ (drb))
922 return false;
924 /* If dra and drb are part of the same interleaving chain consider
925 them independent. */
926 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
927 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
928 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
929 return false;
931 /* Unknown data dependence. */
932 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
934 if (dump_enabled_p ())
935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
936 "can't determine dependence between %T and %T\n",
937 DR_REF (dra), DR_REF (drb));
939 else if (dump_enabled_p ())
940 dump_printf_loc (MSG_NOTE, vect_location,
941 "determined dependence between %T and %T\n",
942 DR_REF (dra), DR_REF (drb));
944 return true;
948 /* Analyze dependences involved in the transform of a store SLP NODE. */
950 static bool
951 vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
953 /* This walks over all stmts involved in the SLP store done
954 in NODE verifying we can sink them up to the last stmt in the
955 group. */
956 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
957 gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));
959 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
961 stmt_vec_info access_info
962 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
963 if (access_info == last_access_info)
964 continue;
965 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
966 ao_ref ref;
967 bool ref_initialized_p = false;
968 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
969 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
971 gimple *stmt = gsi_stmt (gsi);
972 if (! gimple_vuse (stmt))
973 continue;
975 /* If we couldn't record a (single) data reference for this
976 stmt we have to resort to the alias oracle. */
977 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
978 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
979 if (!dr_b)
981 /* We are moving a store - this means
982 we cannot use TBAA for disambiguation. */
983 if (!ref_initialized_p)
984 ao_ref_init (&ref, DR_REF (dr_a));
985 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
986 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
987 return false;
988 continue;
991 gcc_assert (!gimple_visited_p (stmt));
993 ddr_p ddr = initialize_data_dependence_relation (dr_a,
994 dr_b, vNULL);
995 bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
996 free_dependence_relation (ddr);
997 if (dependent)
998 return false;
1001 return true;
1004 /* Analyze dependences involved in the transform of a load SLP NODE. STORES
1005 contain the vector of scalar stores of this instance if we are
1006 disambiguating the loads. */
1008 static bool
1009 vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
1010 vec<stmt_vec_info> stores,
1011 stmt_vec_info last_store_info)
1013 /* This walks over all stmts involved in the SLP load done
1014 in NODE verifying we can hoist them up to the first stmt in the
1015 group. */
1016 stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
1017 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));
1019 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
1021 stmt_vec_info access_info
1022 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
1023 if (access_info == first_access_info)
1024 continue;
1025 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
1026 ao_ref ref;
1027 bool ref_initialized_p = false;
1028 hash_set<stmt_vec_info> grp_visited;
1029 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
1030 gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
1032 gimple *stmt = gsi_stmt (gsi);
1033 if (! gimple_vdef (stmt))
1034 continue;
1036 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
1038 /* If we run into a store of this same instance (we've just
1039 marked those) then delay dependence checking until we run
1040 into the last store because this is where it will have
1041 been sunk to (and we verified that we can do that already). */
1042 if (gimple_visited_p (stmt))
1044 if (stmt_info != last_store_info)
1045 continue;
1047 for (stmt_vec_info &store_info : stores)
1049 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
1050 ddr_p ddr = initialize_data_dependence_relation
1051 (dr_a, store_dr, vNULL);
1052 bool dependent
1053 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1054 free_dependence_relation (ddr);
1055 if (dependent)
1056 return false;
1058 continue;
1061 auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
1063 /* We are hoisting a load - this means we can use TBAA for
1064 disambiguation. */
1065 if (!ref_initialized_p)
1066 ao_ref_init (&ref, DR_REF (dr_a));
1067 if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
1069 /* If we couldn't record a (single) data reference for this
1070 stmt we have to give up now. */
1071 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
1072 if (!dr_b)
1073 return false;
1074 ddr_p ddr = initialize_data_dependence_relation (dr_a,
1075 dr_b, vNULL);
1076 bool dependent
1077 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1078 free_dependence_relation (ddr);
1079 if (dependent)
1080 return false;
1082 /* No dependence. */
1083 return true;
1085 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1087 /* When we run into a store group we have to honor
1088 that earlier stores might be moved here. We don't
1089 know exactly which and where to since we lack a
1090 back-mapping from DR to SLP node, so assume all
1091 earlier stores are sunk here. It's enough to
1092 consider the last stmt of a group for this.
1093 ??? Both this and the fact that we disregard that
1094 the conflicting instance might be removed later
1095 is overly conservative. */
1096 if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
1097 for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
1098 store_info != NULL;
1099 store_info = DR_GROUP_NEXT_ELEMENT (store_info))
1100 if ((store_info == stmt_info
1101 || get_later_stmt (store_info, stmt_info) == stmt_info)
1102 && !check_hoist (store_info))
1103 return false;
1105 else
1107 if (!check_hoist (stmt_info))
1108 return false;
1112 return true;
1116 /* Function vect_slp_analyze_instance_dependence.
1118 Examine all the data references in the SLP instance, and make sure there
1119 do not exist any data dependences between them that would prevent the
1120 transform. Return true if the instance is vectorizable in this respect. */
1122 bool
1123 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
1125 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
1127 /* The stores of this instance are at the root of the SLP tree. */
1128 slp_tree store = NULL;
1129 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
1130 store = SLP_INSTANCE_TREE (instance);
1132 /* Verify we can sink stores to the vectorized stmt insert location. */
1133 stmt_vec_info last_store_info = NULL;
1134 if (store)
1136 if (! vect_slp_analyze_store_dependences (vinfo, store))
1137 return false;
1139 /* Mark stores in this instance and remember the last one. */
1140 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
1141 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1142 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
1145 bool res = true;
1147 /* Verify we can sink loads to the vectorized stmt insert location,
1148 special-casing stores of this instance. */
1149 for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
1150 if (! vect_slp_analyze_load_dependences (vinfo, load,
1151 store
1152 ? SLP_TREE_SCALAR_STMTS (store)
1153 : vNULL, last_store_info))
1155 res = false;
1156 break;
1159 /* Unset the visited flag. */
1160 if (store)
1161 for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1162 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
1164 return res;
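
/* Illustrative sketch (not compiled): a basic-block SLP instance whose
   stores are sunk to the position of the last scalar store and whose loads
   are hoisted to the first scalar load.  Whether that is valid depends on
   disambiguating the intervening store through P, which is what the
   dependence walks above do.  Names are made up.  */
#if 0
void
example_bb_slp (int *__restrict a, const int *__restrict b, int *p)
{
  a[0] = b[0] + 1;
  *p = 0;            /* must neither clobber b[] nor be clobbered by a[].  */
  a[1] = b[1] + 1;
  a[2] = b[2] + 1;
  a[3] = b[3] + 1;
}
#endif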
1167 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
1168 applied. */
1171 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
1173 HOST_WIDE_INT diff = 0;
1174 /* Alignment is only analyzed for the first element of a DR group,
1175 use that but adjust misalignment by the offset of the access. */
1176 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
1178 dr_vec_info *first_dr
1179 = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
1180 /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
1181 INTEGER_CSTs and the first element in the group has the lowest
1182 address. */
1183 diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
1184 - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
1185 gcc_assert (diff >= 0);
1186 dr_info = first_dr;
1189 int misalign = dr_info->misalignment;
1190 gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
1191 if (misalign == DR_MISALIGNMENT_UNKNOWN)
1192 return misalign;
1194 /* If the access is only aligned for a vector type with smaller alignment
1195 requirement the access has unknown misalignment. */
1196 if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
1197 targetm.vectorize.preferred_vector_alignment (vectype)))
1198 return DR_MISALIGNMENT_UNKNOWN;
1200 /* Apply the offset from the DR group start and the externally supplied
1201 offset which can for example result from a negative stride access. */
1202 poly_int64 misalignment = misalign + diff + offset;
1204 /* vect_compute_data_ref_alignment will have ensured that target_alignment
1205 is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN. */
1206 unsigned HOST_WIDE_INT target_alignment_c
1207 = dr_info->target_alignment.to_constant ();
1208 if (!known_misalignment (misalignment, target_alignment_c, &misalign))
1209 return DR_MISALIGNMENT_UNKNOWN;
1210 return misalign;
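
/* Worked example: assume a group of 4-byte elements whose first element has
   misalignment 4 against a 16-byte target alignment.  For the second
   element of the group DIFF is 4, so with OFFSET 0 the routine above
   returns (4 + 4 + 0) % 16 = 8.  If a negative-step access supplied
   OFFSET -12 instead, the result would be (4 + 4 - 12) % 16 = 12.  */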
1213 /* Record the base alignment guarantee given by DRB, which occurs
1214 in STMT_INFO. */
1216 static void
1217 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
1218 innermost_loop_behavior *drb)
1220 bool existed;
1221 std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
1222 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
1223 if (!existed || entry.second->base_alignment < drb->base_alignment)
1225 entry = std::make_pair (stmt_info, drb);
1226 if (dump_enabled_p ())
1227 dump_printf_loc (MSG_NOTE, vect_location,
1228 "recording new base alignment for %T\n"
1229 " alignment: %d\n"
1230 " misalignment: %d\n"
1231 " based on: %G",
1232 drb->base_address,
1233 drb->base_alignment,
1234 drb->base_misalignment,
1235 stmt_info->stmt);
1239 /* If the region we're going to vectorize is reached, all unconditional
1240 data references occur at least once. We can therefore pool the base
1241 alignment guarantees from each unconditional reference. Do this by
1242 going through all the data references in VINFO and checking whether
1243 the containing statement makes the reference unconditionally. If so,
1244 record the alignment of the base address in VINFO so that it can be
1245 used for all other references with the same base. */
1247 void
1248 vect_record_base_alignments (vec_info *vinfo)
1250 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1251 class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
1252 for (data_reference *dr : vinfo->shared->datarefs)
1254 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1255 stmt_vec_info stmt_info = dr_info->stmt;
1256 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
1257 && STMT_VINFO_VECTORIZABLE (stmt_info)
1258 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1260 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
1262 /* If DR is nested in the loop that is being vectorized, we can also
1263 record the alignment of the base wrt the outer loop. */
1264 if (loop && nested_in_vect_loop_p (loop, stmt_info))
1265 vect_record_base_alignment
1266 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
1271 /* Function vect_compute_data_ref_alignment
1273 Compute the misalignment of the data reference DR_INFO when vectorizing
1274 with VECTYPE.
1276 Output:
1277 1. initialized misalignment info for DR_INFO
1279 FOR NOW: No analysis is actually performed. Misalignment is calculated
1280 only for trivial cases. TODO. */
1282 static void
1283 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1284 tree vectype)
1286 stmt_vec_info stmt_info = dr_info->stmt;
1287 vec_base_alignments *base_alignments = &vinfo->base_alignments;
1288 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1289 class loop *loop = NULL;
1290 tree ref = DR_REF (dr_info->dr);
1292 if (dump_enabled_p ())
1293 dump_printf_loc (MSG_NOTE, vect_location,
1294 "vect_compute_data_ref_alignment:\n");
1296 if (loop_vinfo)
1297 loop = LOOP_VINFO_LOOP (loop_vinfo);
1299 /* Initialize misalignment to unknown. */
1300 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1302 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1303 return;
1305 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1306 bool step_preserves_misalignment_p;
1308 poly_uint64 vector_alignment
1309 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1310 BITS_PER_UNIT);
1311 SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1313 /* If the main loop has peeled for alignment we have no way of knowing
1314 whether the data accesses in the epilogues are aligned. We can't at
1315 compile time answer the question whether we have entered the main loop or
1316 not. Fixes PR 92351. */
1317 if (loop_vinfo)
1319 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1320 if (orig_loop_vinfo
1321 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1322 return;
1325 unsigned HOST_WIDE_INT vect_align_c;
1326 if (!vector_alignment.is_constant (&vect_align_c))
1327 return;
1329 /* No step for BB vectorization. */
1330 if (!loop)
1332 gcc_assert (integer_zerop (drb->step));
1333 step_preserves_misalignment_p = true;
1336 /* In case the dataref is in an inner-loop of the loop that is being
1337 vectorized (LOOP), we use the base and misalignment information
1338 relative to the outer-loop (LOOP). This is ok only if the misalignment
1339 stays the same throughout the execution of the inner-loop, which is why
1340 we have to check that the stride of the dataref in the inner-loop evenly
1341 divides by the vector alignment. */
1342 else if (nested_in_vect_loop_p (loop, stmt_info))
1344 step_preserves_misalignment_p
1345 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1347 if (dump_enabled_p ())
1349 if (step_preserves_misalignment_p)
1350 dump_printf_loc (MSG_NOTE, vect_location,
1351 "inner step divides the vector alignment.\n");
1352 else
1353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1354 "inner step doesn't divide the vector"
1355 " alignment.\n");
1359 /* Similarly we can only use base and misalignment information relative to
1360 an innermost loop if the misalignment stays the same throughout the
1361 execution of the loop. As above, this is the case if the stride of
1362 the dataref evenly divides by the alignment. */
1363 else
1365 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1366 step_preserves_misalignment_p
1367 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1369 if (!step_preserves_misalignment_p && dump_enabled_p ())
1370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1371 "step doesn't divide the vector alignment.\n");
1374 unsigned int base_alignment = drb->base_alignment;
1375 unsigned int base_misalignment = drb->base_misalignment;
1377 /* Calculate the maximum of the pooled base address alignment and the
1378 alignment that we can compute for DR itself. */
1379 std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1380 = base_alignments->get (drb->base_address);
1381 if (entry
1382 && base_alignment < (*entry).second->base_alignment
1383 && (loop_vinfo
1384 || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1385 gimple_bb (entry->first->stmt))
1386 && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1387 || (entry->first->dr_aux.group <= dr_info->group)))))
1389 base_alignment = entry->second->base_alignment;
1390 base_misalignment = entry->second->base_misalignment;
1393 if (drb->offset_alignment < vect_align_c
1394 || !step_preserves_misalignment_p
1395 /* We need to know whether the step wrt the vectorized loop is
1396 negative when computing the starting misalignment below. */
1397 || TREE_CODE (drb->step) != INTEGER_CST)
1399 if (dump_enabled_p ())
1400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1401 "Unknown alignment for access: %T\n", ref);
1402 return;
1405 if (base_alignment < vect_align_c)
1407 unsigned int max_alignment;
1408 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1409 if (max_alignment < vect_align_c
1410 || !vect_can_force_dr_alignment_p (base,
1411 vect_align_c * BITS_PER_UNIT))
1413 if (dump_enabled_p ())
1414 dump_printf_loc (MSG_NOTE, vect_location,
1415 "can't force alignment of ref: %T\n", ref);
1416 return;
1419 /* Force the alignment of the decl.
1420 NOTE: This is the only change to the code we make during
1421 the analysis phase, before deciding to vectorize the loop. */
1422 if (dump_enabled_p ())
1423 dump_printf_loc (MSG_NOTE, vect_location,
1424 "force alignment of %T\n", ref);
1426 dr_info->base_decl = base;
1427 dr_info->base_misaligned = true;
1428 base_misalignment = 0;
1430 poly_int64 misalignment
1431 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1433 unsigned int const_misalignment;
1434 if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1436 if (dump_enabled_p ())
1437 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1438 "Non-constant misalignment for access: %T\n", ref);
1439 return;
1442 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1444 if (dump_enabled_p ())
1445 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1446 "misalign = %d bytes of ref %T\n",
1447 const_misalignment, ref);
1449 return;
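
/* Illustrative sketch (not compiled): a file-scope array whose declared
   alignment may be below the preferred vector alignment.  Because the base
   is a decl whose alignment the vectorizer is allowed to force (the
   base_misaligned path above), the loads from it can be treated as aligned
   after analysis.  Names are made up.  */
#if 0
short x[1000];

int
example_force_alignment (int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++)
    sum += x[i];   /* x's alignment can be raised to the vector alignment.  */
  return sum;
}
#endif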
1452 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1453 that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1454 is made aligned via peeling. */
1456 static bool
1457 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1458 dr_vec_info *dr_peel_info)
1460 if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1461 DR_TARGET_ALIGNMENT (dr_info)))
1463 poly_offset_int diff
1464 = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1465 - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1466 if (known_eq (diff, 0)
1467 || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1468 return true;
1470 return false;
1473 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1474 aligned via peeling. */
1476 static bool
1477 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1478 dr_vec_info *dr_peel_info)
1480 if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1481 DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1482 || !operand_equal_p (DR_OFFSET (dr_info->dr),
1483 DR_OFFSET (dr_peel_info->dr), 0)
1484 || !operand_equal_p (DR_STEP (dr_info->dr),
1485 DR_STEP (dr_peel_info->dr), 0))
1486 return false;
1488 return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1491 /* Compute the value for dr_info->misalign so that the access appears
1492 aligned. This is used by peeling to compensate for dr_misalignment
1493 applying the offset for negative step. */
1496 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1498 if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1499 return 0;
1501 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1502 poly_int64 misalignment
1503 = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1504 * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1506 unsigned HOST_WIDE_INT target_alignment_c;
1507 int misalign;
1508 if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1509 || !known_misalignment (misalignment, target_alignment_c, &misalign))
1510 return DR_MISALIGNMENT_UNKNOWN;
1511 return misalign;
1514 /* Function vect_update_misalignment_for_peel.
1515 Sets DR_INFO's misalignment
1516 - to 0 if it has the same alignment as DR_PEEL_INFO,
1517 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1518 - to -1 (unknown) otherwise.
1520 DR_INFO - the data reference whose misalignment is to be adjusted.
1521 DR_PEEL_INFO - the data reference whose misalignment is being made
1522 zero in the vector loop by the peel.
1523 NPEEL - the number of iterations in the peel loop if the misalignment
1524 of DR_PEEL_INFO is known at compile time. */
1526 static void
1527 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1528 dr_vec_info *dr_peel_info, int npeel)
1530 /* If dr_info is aligned whenever dr_peel_info is, then mark it so. */
1531 if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1533 SET_DR_MISALIGNMENT (dr_info,
1534 vect_dr_misalign_for_aligned_access (dr_peel_info));
1535 return;
1538 unsigned HOST_WIDE_INT alignment;
1539 if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1540 && known_alignment_for_access_p (dr_info,
1541 STMT_VINFO_VECTYPE (dr_info->stmt))
1542 && known_alignment_for_access_p (dr_peel_info,
1543 STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1545 int misal = dr_info->misalignment;
1546 misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1547 misal &= alignment - 1;
1548 set_dr_misalignment (dr_info, misal);
1549 return;
1552 if (dump_enabled_p ())
1553 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1554 "to unknown (-1).\n");
1555 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
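
/* Worked example: assume DR_INFO has known misalignment 8 against a
   constant 16-byte target alignment, a step of 4 bytes, and the peel
   removes NPEEL = 3 scalar iterations.  The update above computes
   (8 + 3 * 4) & (16 - 1) = 4 as the new misalignment.  If either
   misalignment were unknown, it would instead be reset to -1.  */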
1558 /* Return true if alignment is relevant for DR_INFO. */
1560 static bool
1561 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1563 stmt_vec_info stmt_info = dr_info->stmt;
1565 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1566 return false;
1568 /* For interleaving, only the alignment of the first access matters. */
1569 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1570 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1571 return false;
1573 /* Scatter-gather and invariant accesses continue to address individual
1574 scalars, so vector-level alignment is irrelevant. */
1575 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1576 || integer_zerop (DR_STEP (dr_info->dr)))
1577 return false;
1579 /* Strided accesses perform only component accesses, alignment is
1580 irrelevant for them. */
1581 if (STMT_VINFO_STRIDED_P (stmt_info)
1582 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1583 return false;
1585 return true;
1588 /* Given a memory reference EXP, return whether its alignment is less
1589 than its size. */
1591 static bool
1592 not_size_aligned (tree exp)
1594 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1595 return true;
1597 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1598 > get_object_alignment (exp));
1601 /* Function vector_alignment_reachable_p
1603 Return true if vector alignment for DR_INFO is reachable by peeling
1604 a few loop iterations. Return false otherwise. */
1606 static bool
1607 vector_alignment_reachable_p (dr_vec_info *dr_info)
1609 stmt_vec_info stmt_info = dr_info->stmt;
1610 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1612 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1614 /* For interleaved accesses we peel only if the number of iterations in
1615 the prologue loop (VF - misalignment) is a multiple of the
1616 number of interleaved accesses. */
1617 int elem_size, mis_in_elements;
1619 /* FORNOW: handle only known alignment. */
1620 if (!known_alignment_for_access_p (dr_info, vectype))
1621 return false;
1623 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1624 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1625 elem_size = vector_element_size (vector_size, nelements);
1626 mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1628 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1629 return false;
1632 /* If the misalignment is known at compile time then allow peeling
1633 only if natural alignment is reachable through peeling. */
1634 if (known_alignment_for_access_p (dr_info, vectype)
1635 && !aligned_access_p (dr_info, vectype))
1637 HOST_WIDE_INT elmsize =
1638 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1639 if (dump_enabled_p ())
1641 dump_printf_loc (MSG_NOTE, vect_location,
1642 "data size = %wd. misalignment = %d.\n", elmsize,
1643 dr_misalignment (dr_info, vectype));
1645 if (dr_misalignment (dr_info, vectype) % elmsize)
1647 if (dump_enabled_p ())
1648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1649 "data size does not divide the misalignment.\n");
1650 return false;
1654 if (!known_alignment_for_access_p (dr_info, vectype))
1656 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1657 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1658 if (dump_enabled_p ())
1659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1660 "Unknown misalignment, %snaturally aligned\n",
1661 is_packed ? "not " : "");
1662 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1665 return true;
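
/* Worked example: with 16-byte vectors of 4-byte ints and a known
   misalignment of 8 bytes, peeling (16 - 8) / 4 = 2 scalar iterations makes
   the access aligned, so alignment is reachable.  With a misalignment of
   6 bytes the element size does not divide the misalignment and the routine
   above returns false.  */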
1669 /* Calculate the cost of the memory access represented by DR_INFO. */
1671 static void
1672 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1673 dr_alignment_support alignment_support_scheme,
1674 int misalignment,
1675 unsigned int *inside_cost,
1676 unsigned int *outside_cost,
1677 stmt_vector_for_cost *body_cost_vec,
1678 stmt_vector_for_cost *prologue_cost_vec)
1680 stmt_vec_info stmt_info = dr_info->stmt;
1681 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1682 int ncopies;
1684 if (PURE_SLP_STMT (stmt_info))
1685 ncopies = 1;
1686 else
1687 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1689 if (DR_IS_READ (dr_info->dr))
1690 vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1691 misalignment, true, inside_cost,
1692 outside_cost, prologue_cost_vec, body_cost_vec, false);
1693 else
1694 vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1695 misalignment, inside_cost, body_cost_vec);
1697 if (dump_enabled_p ())
1698 dump_printf_loc (MSG_NOTE, vect_location,
1699 "vect_get_data_access_cost: inside_cost = %d, "
1700 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1704 typedef struct _vect_peel_info
1706 dr_vec_info *dr_info;
1707 int npeel;
1708 unsigned int count;
1709 } *vect_peel_info;
1711 typedef struct _vect_peel_extended_info
1713 vec_info *vinfo;
1714 struct _vect_peel_info peel_info;
1715 unsigned int inside_cost;
1716 unsigned int outside_cost;
1717 } *vect_peel_extended_info;
1720 /* Peeling hashtable helpers. */
1722 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1724 static inline hashval_t hash (const _vect_peel_info *);
1725 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1728 inline hashval_t
1729 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1731 return (hashval_t) peel_info->npeel;
1734 inline bool
1735 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1737 return (a->npeel == b->npeel);
1741 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1743 static void
1744 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1745 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1746 int npeel, bool supportable_if_not_aligned)
1748 struct _vect_peel_info elem, *slot;
1749 _vect_peel_info **new_slot;
1751 elem.npeel = npeel;
1752 slot = peeling_htab->find (&elem);
1753 if (slot)
1754 slot->count++;
1755 else
1757 slot = XNEW (struct _vect_peel_info);
1758 slot->npeel = npeel;
1759 slot->dr_info = dr_info;
1760 slot->count = 1;
1761 new_slot = peeling_htab->find_slot (slot, INSERT);
1762 *new_slot = slot;
1765 /* If this DR is not supported with unknown misalignment then bias
1766 this slot when the cost model is disabled. */
1767 if (!supportable_if_not_aligned
1768 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1769 slot->count += VECT_MAX_COST;
1773 /* Traverse the peeling hash table to find the peeling option that aligns
1774 the maximum number of data accesses. */
1777 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1778 _vect_peel_extended_info *max)
1780 vect_peel_info elem = *slot;
1782 if (elem->count > max->peel_info.count
1783 || (elem->count == max->peel_info.count
1784 && max->peel_info.npeel > elem->npeel))
1786 max->peel_info.npeel = elem->npeel;
1787 max->peel_info.count = elem->count;
1788 max->peel_info.dr_info = elem->dr_info;
1791 return 1;
1794 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1795 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1796 npeel is computed at runtime but DR0_INFO's misalignment will be zero
1797 after peeling. */
1799 static void
1800 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1801 dr_vec_info *dr0_info,
1802 unsigned int *inside_cost,
1803 unsigned int *outside_cost,
1804 stmt_vector_for_cost *body_cost_vec,
1805 stmt_vector_for_cost *prologue_cost_vec,
1806 unsigned int npeel)
1808 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1810 bool dr0_alignment_known_p
1811 = (dr0_info
1812 && known_alignment_for_access_p (dr0_info,
1813 STMT_VINFO_VECTYPE (dr0_info->stmt)));
1815 for (data_reference *dr : datarefs)
1817 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1818 if (!vect_relevant_for_alignment_p (dr_info))
1819 continue;
1821 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1822 dr_alignment_support alignment_support_scheme;
1823 int misalignment;
1824 unsigned HOST_WIDE_INT alignment;
1826 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1827 size_zero_node) < 0;
1828 poly_int64 off = 0;
1829 if (negative)
1830 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1831 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1833 if (npeel == 0)
1834 misalignment = dr_misalignment (dr_info, vectype, off);
1835 else if (dr_info == dr0_info
1836 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1837 misalignment = 0;
1838 else if (!dr0_alignment_known_p
1839 || !known_alignment_for_access_p (dr_info, vectype)
1840 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1841 misalignment = DR_MISALIGNMENT_UNKNOWN;
1842 else
1844 misalignment = dr_misalignment (dr_info, vectype, off);
1845 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1846 misalignment &= alignment - 1;
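/* Worked example (hypothetical numbers): a DR with misalignment 8,
   step 4 and a 16-byte target alignment becomes (8 + 2*4) & 15 = 0,
   i.e. aligned, after peeling NPEEL = 2 iterations, but
   (8 + 1*4) & 15 = 12 after peeling only one. */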
1848 alignment_support_scheme
1849 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1850 misalignment);
1852 vect_get_data_access_cost (loop_vinfo, dr_info,
1853 alignment_support_scheme, misalignment,
1854 inside_cost, outside_cost,
1855 body_cost_vec, prologue_cost_vec);
1859 /* Traverse peeling hash table and calculate cost for each peeling option.
1860 Find the one with the lowest cost. */
1863 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1864 _vect_peel_extended_info *min)
1866 vect_peel_info elem = *slot;
1867 int dummy;
1868 unsigned int inside_cost = 0, outside_cost = 0;
1869 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1870 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1871 epilogue_cost_vec;
1873 prologue_cost_vec.create (2);
1874 body_cost_vec.create (2);
1875 epilogue_cost_vec.create (2);
1877 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1878 &outside_cost, &body_cost_vec,
1879 &prologue_cost_vec, elem->npeel);
1881 body_cost_vec.release ();
1883 outside_cost += vect_get_known_peeling_cost
1884 (loop_vinfo, elem->npeel, &dummy,
1885 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1886 &prologue_cost_vec, &epilogue_cost_vec);
1888 /* Prologue and epilogue costs are added to the target model later.
1889 These costs depend only on the scalar iteration cost, the
1890 number of peeling iterations finally chosen, and the number of
1891 misaligned statements. So discard the information found here. */
1892 prologue_cost_vec.release ();
1893 epilogue_cost_vec.release ();
1895 if (inside_cost < min->inside_cost
1896 || (inside_cost == min->inside_cost
1897 && outside_cost < min->outside_cost))
1899 min->inside_cost = inside_cost;
1900 min->outside_cost = outside_cost;
1901 min->peel_info.dr_info = elem->dr_info;
1902 min->peel_info.npeel = elem->npeel;
1903 min->peel_info.count = elem->count;
1906 return 1;
1910 /* Choose best peeling option by traversing peeling hash table and either
1911 choosing an option with the lowest cost (if cost model is enabled) or the
1912 option that aligns as many accesses as possible. */
1914 static struct _vect_peel_extended_info
1915 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1916 loop_vec_info loop_vinfo)
1918 struct _vect_peel_extended_info res;
1920 res.peel_info.dr_info = NULL;
1921 res.vinfo = loop_vinfo;
1923 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1925 res.inside_cost = INT_MAX;
1926 res.outside_cost = INT_MAX;
1927 peeling_htab->traverse <_vect_peel_extended_info *,
1928 vect_peeling_hash_get_lowest_cost> (&res);
1930 else
1932 res.peel_info.count = 0;
1933 peeling_htab->traverse <_vect_peel_extended_info *,
1934 vect_peeling_hash_get_most_frequent> (&res);
1935 res.inside_cost = 0;
1936 res.outside_cost = 0;
1939 return res;
1942 /* Return true if the new peeling NPEEL is supported. */
1944 static bool
1945 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1946 unsigned npeel)
1948 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1949 enum dr_alignment_support supportable_dr_alignment;
1951 bool dr0_alignment_known_p
1952 = known_alignment_for_access_p (dr0_info,
1953 STMT_VINFO_VECTYPE (dr0_info->stmt));
1955 /* Ensure that all data refs can be vectorized after the peel. */
1956 for (data_reference *dr : datarefs)
1958 if (dr == dr0_info->dr)
1959 continue;
1961 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1962 if (!vect_relevant_for_alignment_p (dr_info)
1963 || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1964 continue;
1966 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1967 int misalignment;
1968 unsigned HOST_WIDE_INT alignment;
1969 if (!dr0_alignment_known_p
1970 || !known_alignment_for_access_p (dr_info, vectype)
1971 || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1972 misalignment = DR_MISALIGNMENT_UNKNOWN;
1973 else
1975 misalignment = dr_misalignment (dr_info, vectype);
1976 misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1977 misalignment &= alignment - 1;
1979 supportable_dr_alignment
1980 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1981 misalignment);
1982 if (supportable_dr_alignment == dr_unaligned_unsupported)
1983 return false;
1986 return true;
1989 /* Compare two data-references DRA and DRB to group them into chunks
1990 with related alignment. */
1992 static int
1993 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
1995 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
1996 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
1997 int cmp;
1999 /* Stabilize sort. */
2000 if (dra == drb)
2001 return 0;
2003 /* Ordering of DRs according to base. */
2004 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2005 DR_BASE_ADDRESS (drb));
2006 if (cmp != 0)
2007 return cmp;
2009 /* And according to DR_OFFSET. */
2010 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2011 if (cmp != 0)
2012 return cmp;
2014 /* And after step. */
2015 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2016 if (cmp != 0)
2017 return cmp;
2019 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2020 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2021 if (cmp == 0)
2022 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2023 return cmp;
2026 /* Function vect_enhance_data_refs_alignment
2028 This pass will use loop versioning and loop peeling in order to enhance
2029 the alignment of data references in the loop.
2031 FOR NOW: we assume that whatever versioning/peeling takes place, only the
2032 original loop is to be vectorized. Any other loops that are created by
2033 the transformations performed in this pass are not supposed to be
2034 vectorized. This restriction will be relaxed.
2036 This pass will require a cost model to guide it on whether to apply peeling
2037 or versioning or a combination of the two. For example, the scheme that
2038 Intel uses, when given a loop with several memory accesses, is as follows:
2039 choose one memory access ('p') whose alignment you want to force by doing
2040 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
2041 other accesses are not necessarily aligned, or (2) use loop versioning to
2042 generate one loop in which all accesses are aligned, and another loop in
2043 which only 'p' is necessarily aligned.
2045 ("Automatic Intra-Register Vectorization for the Intel Architecture",
2046 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
2047 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
2049 Devising a cost model is the most critical aspect of this work. It will
2050 guide us on which access to peel for, whether to use loop versioning, how
2051 many versions to create, etc. The cost model will probably consist of
2052 generic considerations as well as target specific considerations (on
2053 powerpc for example, misaligned stores are more painful than misaligned
2054 loads).
2056 Here are the general steps involved in alignment enhancements:
2058 -- original loop, before alignment analysis:
2059 for (i=0; i<N; i++){
2060 x = q[i]; # DR_MISALIGNMENT(q) = unknown
2061 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2064 -- After vect_compute_data_refs_alignment:
2065 for (i=0; i<N; i++){
2066 x = q[i]; # DR_MISALIGNMENT(q) = 3
2067 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2070 -- Possibility 1: we do loop versioning:
2071 if (p is aligned) {
2072 for (i=0; i<N; i++){ # loop 1A
2073 x = q[i]; # DR_MISALIGNMENT(q) = 3
2074 p[i] = y; # DR_MISALIGNMENT(p) = 0
2077 else {
2078 for (i=0; i<N; i++){ # loop 1B
2079 x = q[i]; # DR_MISALIGNMENT(q) = 3
2080 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2084 -- Possibility 2: we do loop peeling:
2085 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2086 x = q[i];
2087 p[i] = y;
2089 for (i = 3; i < N; i++){ # loop 2A
2090 x = q[i]; # DR_MISALIGNMENT(q) = 0
2091 p[i] = y; # DR_MISALIGNMENT(p) = unknown
2094 -- Possibility 3: combination of loop peeling and versioning:
2095 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2096 x = q[i];
2097 p[i] = y;
2099 if (p is aligned) {
2100 for (i = 3; i<N; i++){ # loop 3A
2101 x = q[i]; # DR_MISALIGNMENT(q) = 0
2102 p[i] = y; # DR_MISALIGNMENT(p) = 0
2105 else {
2106 for (i = 3; i<N; i++){ # loop 3B
2107 x = q[i]; # DR_MISALIGNMENT(q) = 0
2108 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2112 These loops are later passed to loop_transform to be vectorized. The
2113 vectorizer will use the alignment information to guide the transformation
2114 (whether to generate regular loads/stores, or with special handling for
2115 misalignment). */
2117 opt_result
2118 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
2120 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2121 dr_vec_info *first_store = NULL;
2122 dr_vec_info *dr0_info = NULL;
2123 struct data_reference *dr;
2124 unsigned int i;
2125 bool do_peeling = false;
2126 bool do_versioning = false;
2127 unsigned int npeel = 0;
2128 bool one_misalignment_known = false;
2129 bool one_misalignment_unknown = false;
2130 bool one_dr_unsupportable = false;
2131 dr_vec_info *unsupportable_dr_info = NULL;
2132 unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
2133 hash_table<peel_info_hasher> peeling_htab (1);
2135 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
2137 /* Reset data so we can safely be called multiple times. */
2138 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2139 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
2141 if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
2142 return opt_result::success ();
2144 /* Sort the vector of datarefs so DRs that have the same or dependent
2145 alignment are next to each other. */
2146 auto_vec<data_reference_p> datarefs
2147 = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
2148 datarefs.qsort (dr_align_group_sort_cmp);
2150 /* For each dataref, compute the number of other DRs that would become
2151 aligned if we peeled so that this dataref becomes aligned. */
2152 auto_vec<unsigned> n_same_align_refs (datarefs.length ());
2153 n_same_align_refs.quick_grow_cleared (datarefs.length ());
2154 unsigned i0;
2155 for (i0 = 0; i0 < datarefs.length (); ++i0)
2156 if (DR_BASE_ADDRESS (datarefs[i0]))
2157 break;
2158 for (i = i0 + 1; i <= datarefs.length (); ++i)
2160 if (i == datarefs.length ()
2161 || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
2162 DR_BASE_ADDRESS (datarefs[i]), 0)
2163 || !operand_equal_p (DR_OFFSET (datarefs[i0]),
2164 DR_OFFSET (datarefs[i]), 0)
2165 || !operand_equal_p (DR_STEP (datarefs[i0]),
2166 DR_STEP (datarefs[i]), 0))
2168 /* The subgroup [i0, i-1] now only differs in DR_INIT and
2169 possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
2170 will get known misalignment if we align one of the refs
2171 with the largest DR_TARGET_ALIGNMENT. */
2172 for (unsigned j = i0; j < i; ++j)
2174 dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
2175 for (unsigned k = i0; k < i; ++k)
2177 if (k == j)
2178 continue;
2179 dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
2180 if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
2181 dr_infoj))
2182 n_same_align_refs[j]++;
2185 i0 = i;
2189 /* While cost model enhancements are expected in the future, the high level
2190 view of the code at this time is as follows:
2192 A) If there is a misaligned access then see if peeling to align
2193 this access can make all data references satisfy
2194 vect_supportable_dr_alignment. If so, update data structures
2195 as needed and return true.
2197 B) If peeling wasn't possible and there is a data reference with an
2198 unknown misalignment that does not satisfy vect_supportable_dr_alignment
2199 then see if loop versioning checks can be used to make all data
2200 references satisfy vect_supportable_dr_alignment. If so, update
2201 data structures as needed and return true.
2203 C) If neither peeling nor versioning were successful then return false if
2204 any data reference does not satisfy vect_supportable_dr_alignment.
2206 D) Return true (all data references satisfy vect_supportable_dr_alignment).
2208 Note, Possibility 3 above (which is peeling and versioning together) is not
2209 being done at this time. */
2211 /* (1) Peeling to force alignment. */
2213 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
2214 Considerations:
2215 + How many accesses will become aligned due to the peeling
2216 - How many accesses will become unaligned due to the peeling,
2217 and the cost of misaligned accesses.
2218 - The cost of peeling (the extra runtime checks, the increase
2219 in code size). */
2221 FOR_EACH_VEC_ELT (datarefs, i, dr)
2223 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2224 if (!vect_relevant_for_alignment_p (dr_info))
2225 continue;
2227 stmt_vec_info stmt_info = dr_info->stmt;
2228 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2229 do_peeling = vector_alignment_reachable_p (dr_info);
2230 if (do_peeling)
2232 if (known_alignment_for_access_p (dr_info, vectype))
2234 unsigned int npeel_tmp = 0;
2235 bool negative = tree_int_cst_compare (DR_STEP (dr),
2236 size_zero_node) < 0;
2238 /* If known_alignment_for_access_p then we have set
2239 DR_MISALIGNMENT, which is only done if we know it at compile
2240 time, so it is safe to assume the target alignment is constant. */
2242 unsigned int target_align =
2243 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
2244 unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
2245 poly_int64 off = 0;
2246 if (negative)
2247 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
2248 unsigned int mis = dr_misalignment (dr_info, vectype, off);
2249 mis = negative ? mis : -mis;
2250 if (mis != 0)
2251 npeel_tmp = (mis & (target_align - 1)) / dr_size;
2253 /* For multiple types, it is possible that the bigger type access
2254 will have more than one peeling option. E.g., a loop with two
2255 types: one of size (vector size / 4), and the other one of
2256 size (vector size / 8). Vectorization factor will be 8. If both
2257 accesses are misaligned by 3, the first one needs one scalar
2258 iteration to be aligned, and the second one needs 5. But the
2259 first one will be aligned also by peeling 5 scalar
2260 iterations, and in that case both accesses will be aligned.
2261 Hence, in addition to the immediate peeling amount, we also want
2262 to try adding a full vector size, as long as we don't exceed the
2263 vectorization factor.
2264 We do this automatically for the cost model, since we calculate
2265 cost for every peeling option. */
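/* Worked example (hypothetical numbers): with 4-byte elements, a
   16-byte target alignment and a misalignment of 4 bytes, the
   immediate peeling amount is (-4 & 15) / 4 = 12 / 4 = 3 iterations;
   with a vectorization factor of 8 we would also record 3 + 16/4 = 7
   as a peeling option, but stop before 11 since it exceeds VF. */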
2266 poly_uint64 nscalars = npeel_tmp;
2267 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2269 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2270 nscalars = (STMT_SLP_TYPE (stmt_info)
2271 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
2274 /* Save info about DR in the hash table. Also include peeling
2275 amounts according to the explanation above. Indicate
2276 the alignment status when the ref is not aligned.
2277 ??? Rather than using unknown alignment here we should
2278 prune all entries from the peeling hashtable which cause
2279 DRs to be not supported. */
2280 bool supportable_if_not_aligned
2281 = vect_supportable_dr_alignment
2282 (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2283 while (known_le (npeel_tmp, nscalars))
2285 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2286 dr_info, npeel_tmp,
2287 supportable_if_not_aligned);
2288 npeel_tmp += MAX (1, target_align / dr_size);
2291 one_misalignment_known = true;
2293 else
2295 /* If we don't know any misalignment values, we prefer
2296 peeling for data-ref that has the maximum number of data-refs
2297 with the same alignment, unless the target prefers to align
2298 stores over loads. */
2299 unsigned same_align_drs = n_same_align_refs[i];
2300 if (!dr0_info
2301 || dr0_same_align_drs < same_align_drs)
2303 dr0_same_align_drs = same_align_drs;
2304 dr0_info = dr_info;
2306 /* For data-refs with the same number of related
2307 accesses prefer the one where the misalign
2308 computation will be invariant in the outermost loop. */
2309 else if (dr0_same_align_drs == same_align_drs)
2311 class loop *ivloop0, *ivloop;
2312 ivloop0 = outermost_invariant_loop_for_expr
2313 (loop, DR_BASE_ADDRESS (dr0_info->dr));
2314 ivloop = outermost_invariant_loop_for_expr
2315 (loop, DR_BASE_ADDRESS (dr));
2316 if ((ivloop && !ivloop0)
2317 || (ivloop && ivloop0
2318 && flow_loop_nested_p (ivloop, ivloop0)))
2319 dr0_info = dr_info;
2322 one_misalignment_unknown = true;
2324 /* Check for data refs with unsupportable alignment that
2325 can be peeled. */
2326 enum dr_alignment_support supportable_dr_alignment
2327 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2328 DR_MISALIGNMENT_UNKNOWN);
2329 if (supportable_dr_alignment == dr_unaligned_unsupported)
2331 one_dr_unsupportable = true;
2332 unsupportable_dr_info = dr_info;
2335 if (!first_store && DR_IS_WRITE (dr))
2337 first_store = dr_info;
2338 first_store_same_align_drs = same_align_drs;
2342 else
2344 if (!aligned_access_p (dr_info, vectype))
2346 if (dump_enabled_p ())
2347 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2348 "vector alignment may not be reachable\n");
2349 break;
2354 /* Check if we can possibly peel the loop. */
2355 if (!vect_can_advance_ivs_p (loop_vinfo)
2356 || !slpeel_can_duplicate_loop_p (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
2357 loop_preheader_edge (loop))
2358 || loop->inner
2359 || LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
2360 do_peeling = false;
2362 struct _vect_peel_extended_info peel_for_known_alignment;
2363 struct _vect_peel_extended_info peel_for_unknown_alignment;
2364 struct _vect_peel_extended_info best_peel;
2366 peel_for_unknown_alignment.inside_cost = INT_MAX;
2367 peel_for_unknown_alignment.outside_cost = INT_MAX;
2368 peel_for_unknown_alignment.peel_info.count = 0;
2370 if (do_peeling
2371 && one_misalignment_unknown)
2373 /* Check if the target requires to prefer stores over loads, i.e., if
2374 misaligned stores are more expensive than misaligned loads (taking
2375 drs with same alignment into account). */
2376 unsigned int load_inside_cost = 0;
2377 unsigned int load_outside_cost = 0;
2378 unsigned int store_inside_cost = 0;
2379 unsigned int store_outside_cost = 0;
2380 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2382 stmt_vector_for_cost dummy;
2383 dummy.create (2);
2384 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2385 &load_inside_cost,
2386 &load_outside_cost,
2387 &dummy, &dummy, estimated_npeels);
2388 dummy.release ();
2390 if (first_store)
2392 dummy.create (2);
2393 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2394 &store_inside_cost,
2395 &store_outside_cost,
2396 &dummy, &dummy,
2397 estimated_npeels);
2398 dummy.release ();
2400 else
2402 store_inside_cost = INT_MAX;
2403 store_outside_cost = INT_MAX;
2406 if (load_inside_cost > store_inside_cost
2407 || (load_inside_cost == store_inside_cost
2408 && load_outside_cost > store_outside_cost))
2410 dr0_info = first_store;
2411 dr0_same_align_drs = first_store_same_align_drs;
2412 peel_for_unknown_alignment.inside_cost = store_inside_cost;
2413 peel_for_unknown_alignment.outside_cost = store_outside_cost;
2415 else
2417 peel_for_unknown_alignment.inside_cost = load_inside_cost;
2418 peel_for_unknown_alignment.outside_cost = load_outside_cost;
2421 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2422 prologue_cost_vec.create (2);
2423 epilogue_cost_vec.create (2);
2425 int dummy2;
2426 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2427 (loop_vinfo, estimated_npeels, &dummy2,
2428 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2429 &prologue_cost_vec, &epilogue_cost_vec);
2431 prologue_cost_vec.release ();
2432 epilogue_cost_vec.release ();
2434 peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2437 peel_for_unknown_alignment.peel_info.npeel = 0;
2438 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2440 best_peel = peel_for_unknown_alignment;
2442 peel_for_known_alignment.inside_cost = INT_MAX;
2443 peel_for_known_alignment.outside_cost = INT_MAX;
2444 peel_for_known_alignment.peel_info.count = 0;
2445 peel_for_known_alignment.peel_info.dr_info = NULL;
2447 if (do_peeling && one_misalignment_known)
2449 /* Peeling is possible, but there is no data access that is not supported
2450 unless aligned. So we try to choose the best possible peeling from
2451 the hash table. */
2452 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2453 (&peeling_htab, loop_vinfo);
2456 /* Compare costs of peeling for known and unknown alignment. */
2457 if (peel_for_known_alignment.peel_info.dr_info != NULL
2458 && peel_for_unknown_alignment.inside_cost
2459 >= peel_for_known_alignment.inside_cost)
2461 best_peel = peel_for_known_alignment;
2463 /* If the best peeling for known alignment has NPEEL == 0, perform no
2464 peeling at all except if there is an unsupportable dr that we can
2465 align. */
2466 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2467 do_peeling = false;
2470 /* If there is an unsupportable data ref, prefer this over all choices so far
2471 since we'd have to discard a chosen peeling except when it accidentally
2472 aligned the unsupportable data ref. */
2473 if (one_dr_unsupportable)
2474 dr0_info = unsupportable_dr_info;
2475 else if (do_peeling)
2477 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2478 TODO: Use nopeel_outside_cost or get rid of it? */
2479 unsigned nopeel_inside_cost = 0;
2480 unsigned nopeel_outside_cost = 0;
2482 stmt_vector_for_cost dummy;
2483 dummy.create (2);
2484 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2485 &nopeel_outside_cost, &dummy, &dummy, 0);
2486 dummy.release ();
2488 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2489 costs will be recorded. */
2490 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2491 prologue_cost_vec.create (2);
2492 epilogue_cost_vec.create (2);
2494 int dummy2;
2495 nopeel_outside_cost += vect_get_known_peeling_cost
2496 (loop_vinfo, 0, &dummy2,
2497 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2498 &prologue_cost_vec, &epilogue_cost_vec);
2500 prologue_cost_vec.release ();
2501 epilogue_cost_vec.release ();
2503 npeel = best_peel.peel_info.npeel;
2504 dr0_info = best_peel.peel_info.dr_info;
2506 /* If not peeling at all is no more expensive than the best peeling we
2507 have found so far, don't perform any peeling. */
2508 if (nopeel_inside_cost <= best_peel.inside_cost)
2509 do_peeling = false;
2512 if (do_peeling)
2514 stmt_vec_info stmt_info = dr0_info->stmt;
2515 if (known_alignment_for_access_p (dr0_info,
2516 STMT_VINFO_VECTYPE (stmt_info)))
2518 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2519 size_zero_node) < 0;
2520 if (!npeel)
2522 /* Since it's known at compile time, compute the number of
2523 iterations in the peeled loop (the peeling factor) for use in
2524 updating DR_MISALIGNMENT values. The peeling factor is the
2525 vectorization factor minus the misalignment as an element
2526 count. */
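/* Worked example (hypothetical numbers): with 4-byte elements, a
   16-byte target alignment and a known misalignment of 12 bytes,
   NPEEL = (-12 & 15) / 4 = 1 scalar iteration; for a grouped access
   the result is further divided by the group size below. */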
2527 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2528 poly_int64 off = 0;
2529 if (negative)
2530 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2531 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2532 unsigned int mis
2533 = dr_misalignment (dr0_info, vectype, off);
2534 mis = negative ? mis : -mis;
2535 /* If known_alignment_for_access_p then we have set
2536 DR_MISALIGNMENT, which is only done if we know it at compile
2537 time, so it is safe to assume the target alignment is constant. */
2539 unsigned int target_align =
2540 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2541 npeel = ((mis & (target_align - 1))
2542 / vect_get_scalar_dr_size (dr0_info));
2545 /* For interleaved data access every iteration accesses all the
2546 members of the group, therefore we divide the number of iterations
2547 by the group size. */
2548 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2549 npeel /= DR_GROUP_SIZE (stmt_info);
2551 if (dump_enabled_p ())
2552 dump_printf_loc (MSG_NOTE, vect_location,
2553 "Try peeling by %d\n", npeel);
2556 /* Ensure that all datarefs can be vectorized after the peel. */
2557 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2558 do_peeling = false;
2560 /* Check if all datarefs are supportable and log. */
2561 if (do_peeling
2562 && npeel == 0
2563 && known_alignment_for_access_p (dr0_info,
2564 STMT_VINFO_VECTYPE (stmt_info)))
2565 return opt_result::success ();
2567 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2568 if (do_peeling)
2570 unsigned max_allowed_peel
2571 = param_vect_max_peeling_for_alignment;
2572 if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2573 max_allowed_peel = 0;
2574 if (max_allowed_peel != (unsigned)-1)
2576 unsigned max_peel = npeel;
2577 if (max_peel == 0)
2579 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2580 unsigned HOST_WIDE_INT target_align_c;
2581 if (target_align.is_constant (&target_align_c))
2582 max_peel =
2583 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2584 else
2586 do_peeling = false;
2587 if (dump_enabled_p ())
2588 dump_printf_loc (MSG_NOTE, vect_location,
2589 "Disable peeling, max peels set and vector"
2590 " alignment unknown\n");
2593 if (max_peel > max_allowed_peel)
2595 do_peeling = false;
2596 if (dump_enabled_p ())
2597 dump_printf_loc (MSG_NOTE, vect_location,
2598 "Disable peeling, max peels reached: %d\n", max_peel);
2603 /* Cost model #2 - if peeling may result in a remaining loop not
2604 iterating enough to be vectorized then do not peel. Since this
2605 is a cost heuristic rather than a correctness decision, use the
2606 most likely runtime value for variable vectorization factors. */
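/* For instance (hypothetical numbers): with an assumed VF of 8 and a
   runtime-computed peel amount (NPEEL == 0, so MAX_PEEL = 7), a loop
   known to iterate only 12 < 8 + 7 times is not worth peeling. */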
2607 if (do_peeling
2608 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2610 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2611 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2612 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2613 < assumed_vf + max_peel)
2614 do_peeling = false;
2617 if (do_peeling)
2619 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2620 If the misalignment of DR_i is identical to that of dr0 then set
2621 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2622 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2623 by the peeling factor times the element size of DR_i (MOD the
2624 vectorization factor times the size). Otherwise, the
2625 misalignment of DR_i must be set to unknown. */
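/* Worked example (hypothetical numbers): peeling NPEEL = 2 iterations
   for a DR_i with 4-byte elements, old misalignment 8 and a 16-byte
   vector alignment gives (8 + 2*4) mod 16 = 0, so DR_i also becomes
   aligned. */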
2626 FOR_EACH_VEC_ELT (datarefs, i, dr)
2627 if (dr != dr0_info->dr)
2629 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2630 if (!vect_relevant_for_alignment_p (dr_info))
2631 continue;
2633 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2636 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2637 if (npeel)
2638 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2639 else
2640 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2641 SET_DR_MISALIGNMENT (dr0_info,
2642 vect_dr_misalign_for_aligned_access (dr0_info));
2643 if (dump_enabled_p ())
2645 dump_printf_loc (MSG_NOTE, vect_location,
2646 "Alignment of access forced using peeling.\n");
2647 dump_printf_loc (MSG_NOTE, vect_location,
2648 "Peeling for alignment will be applied.\n");
2651 /* The inside-loop cost will be accounted for in vectorizable_load
2652 and vectorizable_store correctly with adjusted alignments.
2653 Drop the body_cst_vec on the floor here. */
2654 return opt_result::success ();
2658 /* (2) Versioning to force alignment. */
2660 /* Try versioning if:
2661 1) the loop is optimized for speed and the cost model is not cheap, and
2662 2) there is at least one unsupported misaligned data ref with an unknown
2663 misalignment, and
2664 3) all misaligned data refs with a known misalignment are supported, and
2665 4) the number of runtime alignment checks is within reason. */
2667 do_versioning
2668 = (optimize_loop_nest_for_speed_p (loop)
2669 && !loop->inner /* FORNOW */
2670 && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2672 if (do_versioning)
2674 FOR_EACH_VEC_ELT (datarefs, i, dr)
2676 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2677 if (!vect_relevant_for_alignment_p (dr_info))
2678 continue;
2680 stmt_vec_info stmt_info = dr_info->stmt;
2681 if (STMT_VINFO_STRIDED_P (stmt_info))
2683 do_versioning = false;
2684 break;
2687 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2688 bool negative = tree_int_cst_compare (DR_STEP (dr),
2689 size_zero_node) < 0;
2690 poly_int64 off = 0;
2691 if (negative)
2692 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2693 * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2694 int misalignment;
2695 if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2696 continue;
2698 enum dr_alignment_support supportable_dr_alignment
2699 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2700 misalignment);
2701 if (supportable_dr_alignment == dr_unaligned_unsupported)
2703 if (misalignment != DR_MISALIGNMENT_UNKNOWN
2704 || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2705 >= (unsigned) param_vect_max_version_for_alignment_checks))
2707 do_versioning = false;
2708 break;
2711 /* At present we don't support versioning for alignment
2712 with variable VF, since there's no guarantee that the
2713 VF is a power of two. We could relax this if we added
2714 a way of enforcing a power-of-two size. */
2715 unsigned HOST_WIDE_INT size;
2716 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2718 do_versioning = false;
2719 break;
2722 /* Forcing alignment in the first iteration is no good if
2723 we don't keep it across iterations. For now, just disable
2724 versioning in this case.
2725 ??? We could actually unroll the loop to achieve the required
2726 overall step alignment, and forcing the alignment could be
2727 done by doing some iterations of the non-vectorized loop. */
2728 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2729 * DR_STEP_ALIGNMENT (dr),
2730 DR_TARGET_ALIGNMENT (dr_info)))
2732 do_versioning = false;
2733 break;
2736 /* The rightmost bits of an aligned address must be zeros.
2737 Construct the mask needed for this test. For example,
2738 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2739 mask must be 15 = 0xf. */
2740 int mask = size - 1;
2742 /* FORNOW: use the same mask to test all potentially unaligned
2743 references in the loop. */
2744 if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2745 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2747 do_versioning = false;
2748 break;
2751 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2752 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2756 /* Versioning requires at least one misaligned data reference. */
2757 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2758 do_versioning = false;
2759 else if (!do_versioning)
2760 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2763 if (do_versioning)
2765 const vec<stmt_vec_info> &may_misalign_stmts
2766 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2767 stmt_vec_info stmt_info;
2769 /* It can now be assumed that the data references in the statements
2770 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2771 of the loop being vectorized. */
2772 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2774 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2775 SET_DR_MISALIGNMENT (dr_info,
2776 vect_dr_misalign_for_aligned_access (dr_info));
2777 if (dump_enabled_p ())
2778 dump_printf_loc (MSG_NOTE, vect_location,
2779 "Alignment of access forced using versioning.\n");
2782 if (dump_enabled_p ())
2783 dump_printf_loc (MSG_NOTE, vect_location,
2784 "Versioning for alignment will be applied.\n");
2786 /* Peeling and versioning can't be done together at this time. */
2787 gcc_assert (! (do_peeling && do_versioning));
2789 return opt_result::success ();
2792 /* This point is reached if neither peeling nor versioning is being done. */
2793 gcc_assert (! (do_peeling || do_versioning));
2795 return opt_result::success ();
2799 /* Function vect_analyze_data_refs_alignment
2801 Analyze the alignment of the data-references in the loop.
2802 Return FALSE if a data reference is found that cannot be vectorized. */
2804 opt_result
2805 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2807 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2809 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2810 struct data_reference *dr;
2811 unsigned int i;
2813 vect_record_base_alignments (loop_vinfo);
2814 FOR_EACH_VEC_ELT (datarefs, i, dr)
2816 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2817 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2819 if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2820 && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2821 continue;
2822 vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2823 STMT_VINFO_VECTYPE (dr_info->stmt));
2827 return opt_result::success ();
2831 /* Analyze alignment of DRs of stmts in NODE. */
2833 static bool
2834 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2836 /* Alignment is maintained in the first element of the group. */
2837 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2838 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2839 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2840 tree vectype = SLP_TREE_VECTYPE (node);
2841 poly_uint64 vector_alignment
2842 = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2843 BITS_PER_UNIT);
2844 if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2845 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2846 /* Re-analyze alignment when we're facing a vectorization with a bigger
2847 alignment requirement. */
2848 else if (known_lt (dr_info->target_alignment, vector_alignment))
2850 poly_uint64 old_target_alignment = dr_info->target_alignment;
2851 int old_misalignment = dr_info->misalignment;
2852 vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2853 /* But keep knowledge about a smaller alignment. */
2854 if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2855 && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2857 dr_info->target_alignment = old_target_alignment;
2858 dr_info->misalignment = old_misalignment;
2861 /* If we ever face unordered target alignments, the first one wins in terms
2862 of the analysis and the other will become unknown in dr_misalignment. */
2863 return true;
2866 /* Function vect_slp_analyze_instance_alignment
2868 Analyze the alignment of the data-references in the SLP instance.
2869 Return FALSE if a data reference is found that cannot be vectorized. */
2871 bool
2872 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2873 slp_instance instance)
2875 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2877 slp_tree node;
2878 unsigned i;
2879 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2880 if (! vect_slp_analyze_node_alignment (vinfo, node))
2881 return false;
2883 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2884 && ! vect_slp_analyze_node_alignment
2885 (vinfo, SLP_INSTANCE_TREE (instance)))
2886 return false;
2888 return true;
2892 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2893 accesses of legal size, step, etc. Detect gaps, single element
2894 interleaving, and other special cases. Set grouped access info.
2895 Collect groups of strided stores for further use in SLP analysis.
2896 Worker for vect_analyze_group_access. */
2898 static bool
2899 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2901 data_reference *dr = dr_info->dr;
2902 tree step = DR_STEP (dr);
2903 tree scalar_type = TREE_TYPE (DR_REF (dr));
2904 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2905 stmt_vec_info stmt_info = dr_info->stmt;
2906 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2907 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2908 HOST_WIDE_INT dr_step = -1;
2909 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2910 bool slp_impossible = false;
2912 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2913 size of the interleaving group (including gaps). */
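/* For example (hypothetical numbers): a step of 12 bytes over 4-byte
   elements gives GROUPSIZE = 3, i.e. each iteration covers three
   element slots, whether or not all of them are actually accessed. */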
2914 if (tree_fits_shwi_p (step))
2916 dr_step = tree_to_shwi (step);
2917 /* Check that STEP is a multiple of type size. Otherwise there is
2918 a non-element-sized gap at the end of the group which we
2919 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2920 ??? As we can handle non-constant step fine here we should
2921 simply remove uses of DR_GROUP_GAP between the last and first
2922 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2923 simply not include that gap. */
2924 if ((dr_step % type_size) != 0)
2926 if (dump_enabled_p ())
2927 dump_printf_loc (MSG_NOTE, vect_location,
2928 "Step %T is not a multiple of the element size"
2929 " for %T\n",
2930 step, DR_REF (dr));
2931 return false;
2933 groupsize = absu_hwi (dr_step) / type_size;
2935 else
2936 groupsize = 0;
2938 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2939 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2941 /* Check if this DR is part of an interleaving group, and is a single
2942 element of the group that is accessed in the loop. */
2944 /* Gaps are supported only for loads. STEP must be a multiple of the type
2945 size. */
2946 if (DR_IS_READ (dr)
2947 && (dr_step % type_size) == 0
2948 && groupsize > 0
2949 /* This could be UINT_MAX but as we are generating code in a very
2950 inefficient way we have to cap earlier.
2951 See PR91403 for example. */
2952 && groupsize <= 4096)
2954 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2955 DR_GROUP_SIZE (stmt_info) = groupsize;
2956 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2957 if (dump_enabled_p ())
2958 dump_printf_loc (MSG_NOTE, vect_location,
2959 "Detected single element interleaving %T"
2960 " step %T\n",
2961 DR_REF (dr), step);
2963 return true;
2966 if (dump_enabled_p ())
2967 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2968 "not consecutive access %G", stmt_info->stmt);
2970 if (bb_vinfo)
2972 /* Mark the statement as unvectorizable. */
2973 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2974 return true;
2977 if (dump_enabled_p ())
2978 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2979 STMT_VINFO_STRIDED_P (stmt_info) = true;
2980 return true;
2983 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2985 /* First stmt in the interleaving chain. Check the chain. */
2986 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2987 struct data_reference *data_ref = dr;
2988 unsigned int count = 1;
2989 tree prev_init = DR_INIT (data_ref);
2990 HOST_WIDE_INT diff, gaps = 0;
2992 /* By construction, all group members have INTEGER_CST DR_INITs. */
2993 while (next)
2995 /* We never have the same DR multiple times. */
2996 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2997 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2999 data_ref = STMT_VINFO_DATA_REF (next);
3001 /* All group members have the same STEP by construction. */
3002 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
3004 /* Check that the distance between two accesses is equal to the type
3005 size. Otherwise, we have gaps. */
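/* For example (hypothetical numbers): with 4-byte elements, inits of
   0 and 8 for consecutive group members give DIFF = 2, i.e. a gap of
   one element between the two accesses. */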
3006 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
3007 - TREE_INT_CST_LOW (prev_init)) / type_size;
3008 if (diff < 1 || diff > UINT_MAX)
3010 /* For artificial testcases with array accesses with large
3011 constant indices we can run into overflow issues which
3012 can end up fooling the groupsize constraint below so
3013 check the individual gaps (which are represented as
3014 unsigned int) as well. */
3015 if (dump_enabled_p ())
3016 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3017 "interleaved access with gap larger "
3018 "than representable\n");
3019 return false;
3021 if (diff != 1)
3023 /* FORNOW: SLP of accesses with gaps is not supported. */
3024 slp_impossible = true;
3025 if (DR_IS_WRITE (data_ref))
3027 if (dump_enabled_p ())
3028 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3029 "interleaved store with gaps\n");
3030 return false;
3033 gaps += diff - 1;
3036 last_accessed_element += diff;
3038 /* Store the gap from the previous member of the group. If there is no
3039 gap in the access, DR_GROUP_GAP is always 1. */
3040 DR_GROUP_GAP (next) = diff;
3042 prev_init = DR_INIT (data_ref);
3043 next = DR_GROUP_NEXT_ELEMENT (next);
3044 /* Count the number of data-refs in the chain. */
3045 count++;
3048 if (groupsize == 0)
3049 groupsize = count + gaps;
3051 /* This could be UINT_MAX but as we are generating code in a very
3052 inefficient way we have to cap earlier. See PR78699 for example. */
3053 if (groupsize > 4096)
3055 if (dump_enabled_p ())
3056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3057 "group is too large\n");
3058 return false;
3061 /* Check that the size of the interleaving is equal to count for stores,
3062 i.e., that there are no gaps. */
3063 if (groupsize != count
3064 && !DR_IS_READ (dr))
3066 groupsize = count;
3067 STMT_VINFO_STRIDED_P (stmt_info) = true;
3070 /* If there is a gap after the last load in the group it is the
3071 difference between the groupsize and the last accessed
3072 element.
3073 When there is no gap, this difference should be 0. */
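/* For example (hypothetical numbers): a group of 4-byte loads at
   inits 0, 4 and 8 under a 16-byte step has GROUPSIZE = 4 and
   LAST_ACCESSED_ELEMENT = 3, leaving DR_GROUP_GAP = 1 after the last
   load. */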
3074 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
3076 DR_GROUP_SIZE (stmt_info) = groupsize;
3077 if (dump_enabled_p ())
3079 dump_printf_loc (MSG_NOTE, vect_location,
3080 "Detected interleaving ");
3081 if (DR_IS_READ (dr))
3082 dump_printf (MSG_NOTE, "load ");
3083 else if (STMT_VINFO_STRIDED_P (stmt_info))
3084 dump_printf (MSG_NOTE, "strided store ");
3085 else
3086 dump_printf (MSG_NOTE, "store ");
3087 dump_printf (MSG_NOTE, "of size %u\n",
3088 (unsigned)groupsize);
3089 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
3090 next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3091 while (next)
3093 if (DR_GROUP_GAP (next) != 1)
3094 dump_printf_loc (MSG_NOTE, vect_location,
3095 "\t<gap of %d elements>\n",
3096 DR_GROUP_GAP (next) - 1);
3097 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
3098 next = DR_GROUP_NEXT_ELEMENT (next);
3100 if (DR_GROUP_GAP (stmt_info) != 0)
3101 dump_printf_loc (MSG_NOTE, vect_location,
3102 "\t<gap of %d elements>\n",
3103 DR_GROUP_GAP (stmt_info));
3106 /* SLP: create an SLP data structure for every interleaving group of
3107 stores for further analysis in vect_analyze_slp. */
3108 if (DR_IS_WRITE (dr) && !slp_impossible)
3110 if (loop_vinfo)
3111 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
3112 if (bb_vinfo)
3113 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3117 return true;
3120 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
3121 accesses of legal size, step, etc. Detect gaps, single element
3122 interleaving, and other special cases. Set grouped access info.
3123 Collect groups of strided stores for further use in SLP analysis. */
3125 static bool
3126 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
3128 if (!vect_analyze_group_access_1 (vinfo, dr_info))
3130 /* Dissolve the group if present. */
3131 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
3132 while (stmt_info)
3134 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3135 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3136 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3137 stmt_info = next;
3139 return false;
3141 return true;
3144 /* Analyze the access pattern of the data-reference DR_INFO.
3145 In case of non-consecutive accesses call vect_analyze_group_access() to
3146 analyze groups of accesses. */
3148 static bool
3149 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
3151 data_reference *dr = dr_info->dr;
3152 tree step = DR_STEP (dr);
3153 tree scalar_type = TREE_TYPE (DR_REF (dr));
3154 stmt_vec_info stmt_info = dr_info->stmt;
3155 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3156 class loop *loop = NULL;
3158 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
3159 return true;
3161 if (loop_vinfo)
3162 loop = LOOP_VINFO_LOOP (loop_vinfo);
3164 if (loop_vinfo && !step)
3166 if (dump_enabled_p ())
3167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3168 "bad data-ref access in loop\n");
3169 return false;
3172 /* Allow loads with zero step in inner-loop vectorization. */
3173 if (loop_vinfo && integer_zerop (step))
3175 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3176 if (!nested_in_vect_loop_p (loop, stmt_info))
3177 return DR_IS_READ (dr);
3178 /* Allow references with zero step for outer loops marked
3179 with pragma omp simd only - it guarantees absence of
3180 loop-carried dependencies between inner loop iterations. */
3181 if (loop->safelen < 2)
3183 if (dump_enabled_p ())
3184 dump_printf_loc (MSG_NOTE, vect_location,
3185 "zero step in inner loop of nest\n");
3186 return false;
3190 if (loop && nested_in_vect_loop_p (loop, stmt_info))
3192 /* Interleaved accesses are not yet supported within outer-loop
3193 vectorization for references in the inner-loop. */
3194 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3196 /* For the rest of the analysis we use the outer-loop step. */
3197 step = STMT_VINFO_DR_STEP (stmt_info);
3198 if (integer_zerop (step))
3200 if (dump_enabled_p ())
3201 dump_printf_loc (MSG_NOTE, vect_location,
3202 "zero step in outer loop.\n");
3203 return DR_IS_READ (dr);
3207 /* Consecutive? */
3208 if (TREE_CODE (step) == INTEGER_CST)
3210 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
3211 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
3212 || (dr_step < 0
3213 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
3215 /* Mark that it is not interleaving. */
3216 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3217 return true;
3221 if (loop && nested_in_vect_loop_p (loop, stmt_info))
3223 if (dump_enabled_p ())
3224 dump_printf_loc (MSG_NOTE, vect_location,
3225 "grouped access in outer loop.\n");
3226 return false;
3230 /* Assume this is a DR handled by non-constant strided load case. */
3231 if (TREE_CODE (step) != INTEGER_CST)
3232 return (STMT_VINFO_STRIDED_P (stmt_info)
3233 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3234 || vect_analyze_group_access (vinfo, dr_info)));
3236 /* Not a consecutive access - check if it's part of an interleaving group. */
3237 return vect_analyze_group_access (vinfo, dr_info);
3240 /* Compare two data-references DRA and DRB to group them into chunks
3241 suitable for grouping. */
3243 static int
3244 dr_group_sort_cmp (const void *dra_, const void *drb_)
3246 dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
3247 dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
3248 data_reference_p dra = dra_info->dr;
3249 data_reference_p drb = drb_info->dr;
3250 int cmp;
3252 /* Stabilize sort. */
3253 if (dra == drb)
3254 return 0;
3256 /* DRs with different group IDs never belong to the same group. */
3257 if (dra_info->group != drb_info->group)
3258 return dra_info->group < drb_info->group ? -1 : 1;
3260 /* Ordering of DRs according to base. */
3261 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3262 DR_BASE_ADDRESS (drb));
3263 if (cmp != 0)
3264 return cmp;
3266 /* And according to DR_OFFSET. */
3267 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
3268 if (cmp != 0)
3269 return cmp;
3271 /* Put reads before writes. */
3272 if (DR_IS_READ (dra) != DR_IS_READ (drb))
3273 return DR_IS_READ (dra) ? -1 : 1;
3275 /* Then sort after access size. */
3276 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
3277 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
3278 if (cmp != 0)
3279 return cmp;
3281 /* And after step. */
3282 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3283 if (cmp != 0)
3284 return cmp;
3286 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
3287 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3288 if (cmp == 0)
3289 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3290 return cmp;
3293 /* If OP is the result of a conversion, return the unconverted value,
3294 otherwise return null. */
3296 static tree
3297 strip_conversion (tree op)
3299 if (TREE_CODE (op) != SSA_NAME)
3300 return NULL_TREE;
3301 gimple *stmt = SSA_NAME_DEF_STMT (op);
3302 if (!is_gimple_assign (stmt)
3303 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3304 return NULL_TREE;
3305 return gimple_assign_rhs1 (stmt);
3308 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3309 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3310 be grouped in SLP mode. */
3312 static bool
3313 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3314 bool allow_slp_p)
3316 if (gimple_assign_single_p (stmt1_info->stmt))
3317 return gimple_assign_single_p (stmt2_info->stmt);
3319 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3320 if (call1 && gimple_call_internal_p (call1))
3322 /* Check for two masked loads or two masked stores. */
3323 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3324 if (!call2 || !gimple_call_internal_p (call2))
3325 return false;
3326 internal_fn ifn = gimple_call_internal_fn (call1);
3327 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3328 return false;
3329 if (ifn != gimple_call_internal_fn (call2))
3330 return false;
3332 /* Check that the masks are the same. Cope with casts of masks,
3333 like those created by build_mask_conversion. */
3334 tree mask1 = gimple_call_arg (call1, 2);
3335 tree mask2 = gimple_call_arg (call2, 2);
3336 if (!operand_equal_p (mask1, mask2, 0) && !allow_slp_p)
3338 mask1 = strip_conversion (mask1);
3339 if (!mask1)
3340 return false;
3341 mask2 = strip_conversion (mask2);
3342 if (!mask2)
3343 return false;
3344 if (!operand_equal_p (mask1, mask2, 0))
3345 return false;
3347 return true;
3350 return false;
3353 /* Function vect_analyze_data_ref_accesses.
3355 Analyze the access pattern of all the data references in the loop.
3357 FORNOW: the only access pattern that is considered vectorizable is a
3358 simple step 1 (consecutive) access.
3360 FORNOW: handle only arrays and pointer accesses. */
3362 opt_result
3363 vect_analyze_data_ref_accesses (vec_info *vinfo,
3364 vec<int> *dataref_groups)
3366 unsigned int i;
3367 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3369 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3371 if (datarefs.is_empty ())
3372 return opt_result::success ();
3374 /* Sort the array of datarefs to make building the interleaving chains
3375 linear. Don't modify the original vector's order, it is needed for
3376 determining what dependencies are reversed. */
3377 vec<dr_vec_info *> datarefs_copy;
3378 datarefs_copy.create (datarefs.length ());
3379 for (unsigned i = 0; i < datarefs.length (); i++)
3381 dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3382 /* If the caller computed DR grouping use that, otherwise group by
3383 basic blocks. */
3384 if (dataref_groups)
3385 dr_info->group = (*dataref_groups)[i];
3386 else
3387 dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3388 datarefs_copy.quick_push (dr_info);
3390 datarefs_copy.qsort (dr_group_sort_cmp);
3391 hash_set<stmt_vec_info> to_fixup;
3393 /* Build the interleaving chains. */
3394 for (i = 0; i < datarefs_copy.length () - 1;)
3396 dr_vec_info *dr_info_a = datarefs_copy[i];
3397 data_reference_p dra = dr_info_a->dr;
3398 int dra_group_id = dr_info_a->group;
3399 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3400 stmt_vec_info lastinfo = NULL;
3401 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3402 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3404 ++i;
3405 continue;
3407 for (i = i + 1; i < datarefs_copy.length (); ++i)
3409 dr_vec_info *dr_info_b = datarefs_copy[i];
3410 data_reference_p drb = dr_info_b->dr;
3411 int drb_group_id = dr_info_b->group;
3412 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3413 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3414 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3415 break;
3417 /* ??? Imperfect sorting (non-compatible types, non-modulo
3418 accesses, same accesses) can lead to a group being artificially
3419 split here as we don't just skip over those. If it really
3420 matters we can push those to a worklist and re-iterate
3421 over them. Then we can just skip ahead to the next DR here. */
3423 /* DRs in a different DR group should not be put into the same
3424 interleaving group. */
3425 if (dra_group_id != drb_group_id)
3426 break;
3428 /* Check that the data-refs have the same first location (except init)
3429 and they are both either store or load (not load and store,
3430 not masked loads or stores). */
3431 if (DR_IS_READ (dra) != DR_IS_READ (drb)
3432 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3433 DR_BASE_ADDRESS (drb)) != 0
3434 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3435 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3436 break;
3438 /* Check that the data-refs have the same constant size. */
3439 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3440 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3441 if (!tree_fits_uhwi_p (sza)
3442 || !tree_fits_uhwi_p (szb)
3443 || !tree_int_cst_equal (sza, szb))
3444 break;
3446 /* Check that the data-refs have the same step. */
3447 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3448 break;
3450 /* Check the types are compatible.
3451 ??? We don't distinguish this during sorting. */
3452 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3453 TREE_TYPE (DR_REF (drb))))
3454 break;
3456 /* Check that the DR_INITs are compile-time constants. */
3457 if (!tree_fits_shwi_p (DR_INIT (dra))
3458 || !tree_fits_shwi_p (DR_INIT (drb)))
3459 break;
3461 /* Different .GOMP_SIMD_LANE calls still give the same lane,
3462 just hold extra information. */
3463 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3464 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3465 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3466 break;
3468 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3469 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3470 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3471 HOST_WIDE_INT init_prev
3472 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3473 gcc_assert (init_a <= init_b
3474 && init_a <= init_prev
3475 && init_prev <= init_b);
3477 /* Do not place the same access in the interleaving chain twice. */
3478 if (init_b == init_prev)
3480 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3481 < gimple_uid (DR_STMT (drb)));
3482 /* Simply link in duplicates and fix up the chain below. */
3484 else
3486 /* If init_b == init_a + the size of the type * k, we have an
3487 interleaving, and DRA is accessed before DRB. */
3488 unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3489 if (type_size_a == 0
3490 || (((unsigned HOST_WIDE_INT)init_b - init_a)
3491 % type_size_a != 0))
3492 break;
3494 /* If we have a store, the accesses must be adjacent. This splits
3495 groups into chunks we support (we don't support vectorization
3496 of stores with gaps). */
3497 if (!DR_IS_READ (dra)
3498 && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3499 != type_size_a))
3500 break;
3502 /* If the step (when constant and nonzero) is smaller than the
3503 difference between the data-refs' inits, this splits groups into
3504 suitable sizes. */
3505 if (tree_fits_shwi_p (DR_STEP (dra)))
3507 unsigned HOST_WIDE_INT step
3508 = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3509 if (step != 0
3510 && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3511 break;
3515 if (dump_enabled_p ())
3516 dump_printf_loc (MSG_NOTE, vect_location,
3517 DR_IS_READ (dra)
3518 ? "Detected interleaving load %T and %T\n"
3519 : "Detected interleaving store %T and %T\n",
3520 DR_REF (dra), DR_REF (drb));
3522 /* Link the found element into the group list. */
3523 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3525 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3526 lastinfo = stmtinfo_a;
3528 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3529 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3530 lastinfo = stmtinfo_b;
3532 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3533 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3535 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3536 dump_printf_loc (MSG_NOTE, vect_location,
3537 "Load suitable for SLP vectorization only.\n");
3539 if (init_b == init_prev
3540 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3541 && dump_enabled_p ())
3542 dump_printf_loc (MSG_NOTE, vect_location,
3543 "Queuing group with duplicate access for fixup\n");
3547 /* Fix up groups with duplicate entries by splitting them. */
3548 while (1)
3550 hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3551 if (!(it != to_fixup.end ()))
3552 break;
3553 stmt_vec_info grp = *it;
3554 to_fixup.remove (grp);
3556 /* Find the earliest duplicate group member. */
3557 unsigned first_duplicate = -1u;
3558 stmt_vec_info next, g = grp;
3559 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3561 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3562 DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3563 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3564 first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3565 g = next;
3567 if (first_duplicate == -1U)
3568 continue;
3570 /* Then move all stmts after the first duplicate to a new group.
3571 Note this is a heuristic, but one with the property that the
3572 group GRP is fixed up completely. */
3573 g = grp;
3574 stmt_vec_info newgroup = NULL, ng = grp;
3575 while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3577 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3579 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3580 if (!newgroup)
3581 newgroup = next;
3582 else
3583 DR_GROUP_NEXT_ELEMENT (ng) = next;
3584 ng = next;
3585 DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3587 else
3588 g = DR_GROUP_NEXT_ELEMENT (g);
3590 DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3592 /* Fix up the new group, which may still contain duplicates. */
3593 to_fixup.add (newgroup);
3596 dr_vec_info *dr_info;
3597 FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3599 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3600 && !vect_analyze_data_ref_access (vinfo, dr_info))
3602 if (dump_enabled_p ())
3603 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3604 "not vectorized: complicated access pattern.\n");
3606 if (is_a <bb_vec_info> (vinfo))
3608 /* Mark the statement as not vectorizable. */
3609 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3610 continue;
3612 else
3614 datarefs_copy.release ();
3615 return opt_result::failure_at (dr_info->stmt->stmt,
3616 "not vectorized:"
3617 " complicated access pattern.\n");
3622 datarefs_copy.release ();
3623 return opt_result::success ();
3626 /* Function vect_vfa_segment_size.
3628 Input:
3629 DR_INFO: The data reference.
3630 LENGTH_FACTOR: segment length to consider.
3632 Return a value suitable for the dr_with_seg_len::seg_len field.
3633 This is the "distance travelled" by the pointer from the first
3634 iteration in the segment to the last. Note that it does not include
3635 the size of the access; in effect it only describes the first byte. */
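/* For example (illustrative numbers only): with DR_STEP equal to 4 bytes
   and LENGTH_FACTOR equal to 8, the function below returns (8 - 1) * 4 = 28,
   i.e. the pointer advances 28 bytes from the first to the last iteration
   of the segment; the bytes actually touched by each access are accounted
   for separately by vect_vfa_access_size.  */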
3637 static tree
3638 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3640 length_factor = size_binop (MINUS_EXPR,
3641 fold_convert (sizetype, length_factor),
3642 size_one_node);
3643 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3644 length_factor);
3647 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3648 gives the worst-case number of bytes covered by the segment. */
3650 static unsigned HOST_WIDE_INT
3651 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3653 stmt_vec_info stmt_vinfo = dr_info->stmt;
3654 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3655 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3656 unsigned HOST_WIDE_INT access_size = ref_size;
3657 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3659 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3660 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3662 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3663 int misalignment;
3664 if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3665 && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3666 && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3667 == dr_explicit_realign_optimized))
3669 /* We might access a full vector's worth. */
3670 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3672 return access_size;
3675 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3676 describes. */
3678 static unsigned int
3679 vect_vfa_align (dr_vec_info *dr_info)
3681 return dr_alignment (dr_info->dr);
3684 /* Function vect_no_alias_p.
3686 Given data references A and B with equal base and offset, see whether
3687 the alias relation can be decided at compilation time. Return 1 if
3688 it can and the references alias, 0 if it can and the references do
3689 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3690 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3691 of dr_with_seg_len::{seg_len,access_size} for A and B. */
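/* Worked example (illustrative values only): for two references with the
   same base, DR_INITs 0 and 32, positive steps, segment lengths 16 and 16
   and access sizes 4 and 4, the covered ranges are [0, 20) and [32, 52);
   those can never overlap, so the function returns 0.  */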
3693 static int
3694 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3695 tree segment_length_a, tree segment_length_b,
3696 unsigned HOST_WIDE_INT access_size_a,
3697 unsigned HOST_WIDE_INT access_size_b)
3699 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3700 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3701 poly_uint64 const_length_a;
3702 poly_uint64 const_length_b;
3704 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3705 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3706 [a, a+12) */
3707 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3709 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3710 offset_a -= const_length_a;
3712 else
3713 const_length_a = tree_to_poly_uint64 (segment_length_a);
3714 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3716 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3717 offset_b -= const_length_b;
3719 else
3720 const_length_b = tree_to_poly_uint64 (segment_length_b);
3722 const_length_a += access_size_a;
3723 const_length_b += access_size_b;
3725 if (ranges_known_overlap_p (offset_a, const_length_a,
3726 offset_b, const_length_b))
3727 return 1;
3729 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3730 offset_b, const_length_b))
3731 return 0;
3733 return -1;
3736 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3737 in DDR is >= VF. */
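/* For instance (hypothetical situation): if the only recorded distance
   vector for a pair of possibly-overlapping references has distance 8 at
   LOOP_DEPTH and the chosen VF is 4, the dependence is still honoured by
   the vector loop, so the caller can drop the runtime alias check for
   this DDR.  */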
3739 static bool
3740 dependence_distance_ge_vf (data_dependence_relation *ddr,
3741 unsigned int loop_depth, poly_uint64 vf)
3743 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3744 || DDR_NUM_DIST_VECTS (ddr) == 0)
3745 return false;
3747 /* If the dependence is exact, we should have limited the VF instead. */
3748 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3750 unsigned int i;
3751 lambda_vector dist_v;
3752 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3754 HOST_WIDE_INT dist = dist_v[loop_depth];
3755 if (dist != 0
3756 && !(dist > 0 && DDR_REVERSED_P (ddr))
3757 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3758 return false;
3761 if (dump_enabled_p ())
3762 dump_printf_loc (MSG_NOTE, vect_location,
3763 "dependence distance between %T and %T is >= VF\n",
3764 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3766 return true;
3769 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3771 static void
3772 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3774 dump_printf (dump_kind, "%s (%T) >= ",
3775 lower_bound.unsigned_p ? "unsigned" : "abs",
3776 lower_bound.expr);
3777 dump_dec (dump_kind, lower_bound.min_value);
3780 /* Record that the vectorized loop requires the vec_lower_bound described
3781 by EXPR, UNSIGNED_P and MIN_VALUE. */
3783 static void
3784 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3785 poly_uint64 min_value)
3787 vec<vec_lower_bound> &lower_bounds
3788 = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3789 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3790 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3792 unsigned_p &= lower_bounds[i].unsigned_p;
3793 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3794 if (lower_bounds[i].unsigned_p != unsigned_p
3795 || maybe_lt (lower_bounds[i].min_value, min_value))
3797 lower_bounds[i].unsigned_p = unsigned_p;
3798 lower_bounds[i].min_value = min_value;
3799 if (dump_enabled_p ())
3801 dump_printf_loc (MSG_NOTE, vect_location,
3802 "updating run-time check to ");
3803 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3804 dump_printf (MSG_NOTE, "\n");
3807 return;
3810 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3811 if (dump_enabled_p ())
3813 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3814 dump_lower_bound (MSG_NOTE, lower_bound);
3815 dump_printf (MSG_NOTE, "\n");
3817 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3820 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3821 will span fewer than GAP bytes. */
3823 static bool
3824 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3825 poly_int64 gap)
3827 stmt_vec_info stmt_info = dr_info->stmt;
3828 HOST_WIDE_INT count
3829 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3830 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3831 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3832 return (estimated_poly_value (gap)
3833 <= count * vect_get_scalar_dr_size (dr_info));
3836 /* Return true if we know that there is no alias between DR_INFO_A and
3837 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3838 When returning true, set *LOWER_BOUND_OUT to this N. */
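/* Worked example (illustrative): for two int accesses with the same base,
   offset and step and with DR_INITs 0 and 4, the combined bytes spanned
   are 4 + 4 = 8, so *LOWER_BOUND_OUT is set to 8 and there is no alias
   whenever abs (DR_STEP) >= 8.  */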
3840 static bool
3841 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3842 poly_uint64 *lower_bound_out)
3844 /* Check that there is a constant gap of known sign between DR_A
3845 and DR_B. */
3846 data_reference *dr_a = dr_info_a->dr;
3847 data_reference *dr_b = dr_info_b->dr;
3848 poly_int64 init_a, init_b;
3849 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3850 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3851 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3852 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3853 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3854 || !ordered_p (init_a, init_b))
3855 return false;
3857 /* Sort DR_A and DR_B by the address they access. */
3858 if (maybe_lt (init_b, init_a))
3860 std::swap (init_a, init_b);
3861 std::swap (dr_info_a, dr_info_b);
3862 std::swap (dr_a, dr_b);
3865 /* If the two accesses could be dependent within a scalar iteration,
3866 make sure that we'd retain their order. */
3867 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3868 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3869 return false;
3871 /* There is no alias if abs (DR_STEP) is greater than or equal to
3872 the bytes spanned by the combination of the two accesses. */
3873 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3874 return true;
3877 /* Function vect_prune_runtime_alias_test_list.
3879 Prune a list of ddrs to be tested at run-time by versioning for alias.
3880 Merge several alias checks into one if possible.
3881 Return FALSE if the resulting list of ddrs is longer than allowed by
3882 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
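/* Sketch of the idea (hypothetical code, illustrative only): for

     void f (int *p, int *q, int n)
     {
       for (int i = 0; i < n; i++)
         p[i] = q[i] + 1;
     }

   vectorization is only valid if the accessed segments of P and Q do not
   overlap, so a single segment-overlap check between the two is queued
   for loop versioning, subject to the cost-model and count limits handled
   below.  */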
3884 opt_result
3885 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3887 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3888 hash_set <tree_pair_hash> compared_objects;
3890 const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3891 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3892 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3893 const vec<vec_object_pair> &check_unequal_addrs
3894 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3895 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3896 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3898 ddr_p ddr;
3899 unsigned int i;
3900 tree length_factor;
3902 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3904 /* Step values are irrelevant for aliasing if the number of vector
3905 iterations is equal to the number of scalar iterations (which can
3906 happen for fully-SLP loops). */
3907 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3909 if (!vf_one_p)
3911 /* Convert the checks for nonzero steps into bound tests. */
3912 tree value;
3913 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3914 vect_check_lower_bound (loop_vinfo, value, true, 1);
3917 if (may_alias_ddrs.is_empty ())
3918 return opt_result::success ();
3920 comp_alias_ddrs.create (may_alias_ddrs.length ());
3922 unsigned int loop_depth
3923 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3924 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3926 /* First, we collect all data ref pairs for aliasing checks. */
3927 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3929 poly_uint64 lower_bound;
3930 tree segment_length_a, segment_length_b;
3931 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3932 unsigned int align_a, align_b;
3934 /* Ignore the alias if the VF we chose ended up being no greater
3935 than the dependence distance. */
3936 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3937 continue;
3939 if (DDR_OBJECT_A (ddr))
3941 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3942 if (!compared_objects.add (new_pair))
3944 if (dump_enabled_p ())
3945 dump_printf_loc (MSG_NOTE, vect_location,
3946 "checking that %T and %T"
3947 " have different addresses\n",
3948 new_pair.first, new_pair.second);
3949 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3951 continue;
3954 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3955 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3957 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3958 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3960 bool preserves_scalar_order_p
3961 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3962 bool ignore_step_p
3963 = (vf_one_p
3964 && (preserves_scalar_order_p
3965 || operand_equal_p (DR_STEP (dr_info_a->dr),
3966 DR_STEP (dr_info_b->dr))));
3968 /* Skip the pair if inter-iteration dependencies are irrelevant
3969 and intra-iteration dependencies are guaranteed to be honored. */
3970 if (ignore_step_p
3971 && (preserves_scalar_order_p
3972 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3973 &lower_bound)))
3975 if (dump_enabled_p ())
3976 dump_printf_loc (MSG_NOTE, vect_location,
3977 "no need for alias check between "
3978 "%T and %T when VF is 1\n",
3979 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3980 continue;
3983 /* See whether we can handle the alias using a bounds check on
3984 the step, and whether that's likely to be the best approach.
3985 (It might not be, for example, if the minimum step is much larger
3986 than the number of bytes handled by one vector iteration.) */
3987 if (!ignore_step_p
3988 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3989 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3990 &lower_bound)
3991 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3992 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3994 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3995 if (dump_enabled_p ())
3997 dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3998 "%T and %T when the step %T is outside ",
3999 DR_REF (dr_info_a->dr),
4000 DR_REF (dr_info_b->dr),
4001 DR_STEP (dr_info_a->dr));
4002 if (unsigned_p)
4003 dump_printf (MSG_NOTE, "[0");
4004 else
4006 dump_printf (MSG_NOTE, "(");
4007 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
4009 dump_printf (MSG_NOTE, ", ");
4010 dump_dec (MSG_NOTE, lower_bound);
4011 dump_printf (MSG_NOTE, ")\n");
4013 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
4014 unsigned_p, lower_bound);
4015 continue;
4018 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
4019 if (dr_group_first_a)
4021 stmt_info_a = dr_group_first_a;
4022 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
4025 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
4026 if (dr_group_first_b)
4028 stmt_info_b = dr_group_first_b;
4029 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
4032 if (ignore_step_p)
4034 segment_length_a = size_zero_node;
4035 segment_length_b = size_zero_node;
4037 else
4039 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
4040 DR_STEP (dr_info_b->dr), 0))
4041 length_factor = scalar_loop_iters;
4042 else
4043 length_factor = size_int (vect_factor);
4044 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
4045 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
4047 access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
4048 access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
4049 align_a = vect_vfa_align (dr_info_a);
4050 align_b = vect_vfa_align (dr_info_b);
4052 /* See whether the alias is known at compilation time. */
4053 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
4054 DR_BASE_ADDRESS (dr_info_b->dr), 0)
4055 && operand_equal_p (DR_OFFSET (dr_info_a->dr),
4056 DR_OFFSET (dr_info_b->dr), 0)
4057 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
4058 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
4059 && poly_int_tree_p (segment_length_a)
4060 && poly_int_tree_p (segment_length_b))
4062 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
4063 segment_length_a,
4064 segment_length_b,
4065 access_size_a,
4066 access_size_b);
4067 if (res >= 0 && dump_enabled_p ())
4069 dump_printf_loc (MSG_NOTE, vect_location,
4070 "can tell at compile time that %T and %T",
4071 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
4072 if (res == 0)
4073 dump_printf (MSG_NOTE, " do not alias\n");
4074 else
4075 dump_printf (MSG_NOTE, " alias\n");
4078 if (res == 0)
4079 continue;
4081 if (res == 1)
4082 return opt_result::failure_at (stmt_info_b->stmt,
4083 "not vectorized:"
4084 " compilation time alias: %G%G",
4085 stmt_info_a->stmt,
4086 stmt_info_b->stmt);
4089 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
4090 access_size_a, align_a);
4091 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
4092 access_size_b, align_b);
4093 /* Canonicalize the order to be the one that's needed for accurate
4094 RAW, WAR and WAW flags, in cases where the data references are
4095 well-ordered. The order doesn't really matter otherwise,
4096 but we might as well be consistent. */
4097 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
4098 std::swap (dr_a, dr_b);
4100 dr_with_seg_len_pair_t dr_with_seg_len_pair
4101 (dr_a, dr_b, (preserves_scalar_order_p
4102 ? dr_with_seg_len_pair_t::WELL_ORDERED
4103 : dr_with_seg_len_pair_t::REORDERED));
4105 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
4108 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
4110 unsigned int count = (comp_alias_ddrs.length ()
4111 + check_unequal_addrs.length ());
4113 if (count
4114 && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
4115 == VECT_COST_MODEL_VERY_CHEAP))
4116 return opt_result::failure_at
4117 (vect_location, "would need a runtime alias check\n");
4119 if (dump_enabled_p ())
4120 dump_printf_loc (MSG_NOTE, vect_location,
4121 "improved number of alias checks from %d to %d\n",
4122 may_alias_ddrs.length (), count);
4123 unsigned limit = param_vect_max_version_for_alias_checks;
4124 if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
4125 limit = param_vect_max_version_for_alias_checks * 6 / 10;
4126 if (count > limit)
4127 return opt_result::failure_at
4128 (vect_location,
4129 "number of versioning for alias run-time tests exceeds %d "
4130 "(--param vect-max-version-for-alias-checks)\n", limit);
4132 return opt_result::success ();
4135 /* Check whether we can use an internal function for a gather load
4136 or scatter store. READ_P is true for loads and false for stores.
4137 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
4138 the type of the memory elements being loaded or stored. OFFSET_TYPE
4139 is the type of the offset that is being applied to the invariant
4140 base address. SCALE is the amount by which the offset should
4141 be multiplied *after* it has been converted to address width.
4143 Return true if the function is supported, storing the function id in
4144 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. */
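/* For example (assumed target capabilities, illustration only): if the
   offset is a 16-bit integer but the target only provides gathers with
   32-bit or 64-bit offset vectors, the loop below widens OFFSET_TYPE step
   by step (16 -> 32 -> 64 bits) until a supported combination is found or
   the precision limits are reached.  */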
4146 bool
4147 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
4148 tree vectype, tree memory_type, tree offset_type,
4149 int scale, internal_fn *ifn_out,
4150 tree *offset_vectype_out)
4152 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
4153 unsigned int element_bits = vector_element_bits (vectype);
4154 if (element_bits != memory_bits)
4155 /* For now the vector elements must be the same width as the
4156 memory elements. */
4157 return false;
4159 /* Work out which function we need. */
4160 internal_fn ifn, alt_ifn, alt_ifn2;
4161 if (read_p)
4163 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
4164 alt_ifn = IFN_MASK_GATHER_LOAD;
4165 /* When the target supports MASK_LEN_GATHER_LOAD, we always
4166 use MASK_LEN_GATHER_LOAD regardless of whether len and
4167 mask are valid or not. */
4168 alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
4170 else
4172 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
4173 alt_ifn = IFN_MASK_SCATTER_STORE;
4174 /* When the target supports MASK_LEN_SCATTER_STORE, we always
4175 use MASK_LEN_SCATTER_STORE regardless of whether len and
4176 mask are valid or not. */
4177 alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
4180 for (;;)
4182 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
4183 if (!offset_vectype)
4184 return false;
4186 /* Test whether the target supports this combination. */
4187 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
4188 offset_vectype, scale))
4190 *ifn_out = ifn;
4191 *offset_vectype_out = offset_vectype;
4192 return true;
4194 else if (!masked_p
4195 && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
4196 memory_type,
4197 offset_vectype,
4198 scale))
4200 *ifn_out = alt_ifn;
4201 *offset_vectype_out = offset_vectype;
4202 return true;
4204 else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
4205 memory_type,
4206 offset_vectype, scale))
4208 *ifn_out = alt_ifn2;
4209 *offset_vectype_out = offset_vectype;
4210 return true;
4213 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
4214 && TYPE_PRECISION (offset_type) >= element_bits)
4215 return false;
4217 offset_type = build_nonstandard_integer_type
4218 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
4222 /* STMT_INFO is a call to an internal gather load or scatter store function.
4223 Describe the operation in INFO. */
4225 static void
4226 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
4227 gather_scatter_info *info)
4229 gcall *call = as_a <gcall *> (stmt_info->stmt);
4230 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4231 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4233 info->ifn = gimple_call_internal_fn (call);
4234 info->decl = NULL_TREE;
4235 info->base = gimple_call_arg (call, 0);
4236 info->offset = gimple_call_arg (call, 1);
4237 info->offset_dt = vect_unknown_def_type;
4238 info->offset_vectype = NULL_TREE;
4239 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
4240 info->element_type = TREE_TYPE (vectype);
4241 info->memory_type = TREE_TYPE (DR_REF (dr));
4244 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
4245 gather load or scatter store. Describe the operation in *INFO if so. */
4247 bool
4248 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
4249 gather_scatter_info *info)
4251 HOST_WIDE_INT scale = 1;
4252 poly_int64 pbitpos, pbitsize;
4253 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4254 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4255 tree offtype = NULL_TREE;
4256 tree decl = NULL_TREE, base, off;
4257 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4258 tree memory_type = TREE_TYPE (DR_REF (dr));
4259 machine_mode pmode;
4260 int punsignedp, reversep, pvolatilep = 0;
4261 internal_fn ifn;
4262 tree offset_vectype;
4263 bool masked_p = false;
4265 /* See whether this is already a call to a gather/scatter internal function.
4266 If not, see whether it's a masked load or store. */
4267 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
4268 if (call && gimple_call_internal_p (call))
4270 ifn = gimple_call_internal_fn (call);
4271 if (internal_gather_scatter_fn_p (ifn))
4273 vect_describe_gather_scatter_call (stmt_info, info);
4274 return true;
4276 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
4279 /* True if we should aim to use internal functions rather than
4280 built-in functions. */
4281 bool use_ifn_p = (DR_IS_READ (dr)
4282 ? supports_vec_gather_load_p (TYPE_MODE (vectype))
4283 : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
4285 base = DR_REF (dr);
4286 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF;
4287 see if we can use the def stmt of the address. */
4288 if (masked_p
4289 && TREE_CODE (base) == MEM_REF
4290 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4291 && integer_zerop (TREE_OPERAND (base, 1))
4292 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4294 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4295 if (is_gimple_assign (def_stmt)
4296 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4297 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4300 /* The gather and scatter builtins need address of the form
4301 loop_invariant + vector * {1, 2, 4, 8}
4302 or
4303 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4304 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4305 of loop invariants/SSA_NAMEs defined in the loop, with casts,
4306 multiplications and additions in it. To get a vector, we need
4307 a single SSA_NAME that will be defined in the loop and will
4308 contain everything that is not loop invariant and that can be
4309 vectorized. The following code attempts to find such a preexisting
4310 SSA_NAME OFF and put the loop invariants into a tree BASE
4311 that can be gimplified before the loop. */
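/* E.g. for a read such as a[idx[i]] (hypothetical names), the intent is to
   end up with BASE as &a plus any constant byte offset, OFF as the
   SSA_NAME holding the value of idx[i], and SCALE typically equal to the
   element size.  */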
4312 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4313 &punsignedp, &reversep, &pvolatilep);
4314 if (reversep)
4315 return false;
4317 /* PR 107346. Packed structs can have fields at offsets that are not
4318 multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */
4319 if (!multiple_p (pbitpos, BITS_PER_UNIT))
4320 return false;
4322 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4324 if (TREE_CODE (base) == MEM_REF)
4326 if (!integer_zerop (TREE_OPERAND (base, 1)))
4328 if (off == NULL_TREE)
4329 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4330 else
4331 off = size_binop (PLUS_EXPR, off,
4332 fold_convert (sizetype, TREE_OPERAND (base, 1)));
4334 base = TREE_OPERAND (base, 0);
4336 else
4337 base = build_fold_addr_expr (base);
4339 if (off == NULL_TREE)
4340 off = size_zero_node;
4342 /* If base is not loop invariant, then if off is 0 we start with just
4343 the constant offset in the loop invariant BASE and continue with base
4344 as OFF; otherwise give up.
4345 We could handle that case by gimplifying the addition of base + off
4346 into some SSA_NAME and using that as off, but for now punt. */
4347 if (!expr_invariant_in_loop_p (loop, base))
4349 if (!integer_zerop (off))
4350 return false;
4351 off = base;
4352 base = size_int (pbytepos);
4354 /* Otherwise put base + constant offset into the loop invariant BASE
4355 and continue with OFF. */
4356 else
4358 base = fold_convert (sizetype, base);
4359 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4362 /* OFF at this point may be either a SSA_NAME or some tree expression
4363 from get_inner_reference. Try to peel off loop invariants from it
4364 into BASE as long as possible. */
4365 STRIP_NOPS (off);
4366 while (offtype == NULL_TREE)
4368 enum tree_code code;
4369 tree op0, op1, add = NULL_TREE;
4371 if (TREE_CODE (off) == SSA_NAME)
4373 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4375 if (expr_invariant_in_loop_p (loop, off))
4376 return false;
4378 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4379 break;
4381 op0 = gimple_assign_rhs1 (def_stmt);
4382 code = gimple_assign_rhs_code (def_stmt);
4383 op1 = gimple_assign_rhs2 (def_stmt);
4385 else
4387 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4388 return false;
4389 code = TREE_CODE (off);
4390 extract_ops_from_tree (off, &code, &op0, &op1);
4392 switch (code)
4394 case POINTER_PLUS_EXPR:
4395 case PLUS_EXPR:
4396 if (expr_invariant_in_loop_p (loop, op0))
4398 add = op0;
4399 off = op1;
4400 do_add:
4401 add = fold_convert (sizetype, add);
4402 if (scale != 1)
4403 add = size_binop (MULT_EXPR, add, size_int (scale));
4404 base = size_binop (PLUS_EXPR, base, add);
4405 continue;
4407 if (expr_invariant_in_loop_p (loop, op1))
4409 add = op1;
4410 off = op0;
4411 goto do_add;
4413 break;
4414 case MINUS_EXPR:
4415 if (expr_invariant_in_loop_p (loop, op1))
4417 add = fold_convert (sizetype, op1);
4418 add = size_binop (MINUS_EXPR, size_zero_node, add);
4419 off = op0;
4420 goto do_add;
4422 break;
4423 case MULT_EXPR:
4424 if (scale == 1 && tree_fits_shwi_p (op1))
4426 int new_scale = tree_to_shwi (op1);
4427 /* Only treat this as a scaling operation if the target
4428 supports it for at least some offset type. */
4429 if (use_ifn_p
4430 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4431 masked_p, vectype, memory_type,
4432 signed_char_type_node,
4433 new_scale, &ifn,
4434 &offset_vectype)
4435 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4436 masked_p, vectype, memory_type,
4437 unsigned_char_type_node,
4438 new_scale, &ifn,
4439 &offset_vectype))
4440 break;
4441 scale = new_scale;
4442 off = op0;
4443 continue;
4445 break;
4446 case SSA_NAME:
4447 off = op0;
4448 continue;
4449 CASE_CONVERT:
4450 if (!POINTER_TYPE_P (TREE_TYPE (op0))
4451 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4452 break;
4454 /* Don't include the conversion if the target is happy with
4455 the current offset type. */
4456 if (use_ifn_p
4457 && TREE_CODE (off) == SSA_NAME
4458 && !POINTER_TYPE_P (TREE_TYPE (off))
4459 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4460 masked_p, vectype, memory_type,
4461 TREE_TYPE (off), scale, &ifn,
4462 &offset_vectype))
4463 break;
4465 if (TYPE_PRECISION (TREE_TYPE (op0))
4466 == TYPE_PRECISION (TREE_TYPE (off)))
4468 off = op0;
4469 continue;
4472 /* Include the conversion if it is widening and we're using
4473 the IFN path, or the target can handle the offset type we
4474 are converting from, or the current size is not already the
4475 same as the data vector element size. */
4476 if ((TYPE_PRECISION (TREE_TYPE (op0))
4477 < TYPE_PRECISION (TREE_TYPE (off)))
4478 && (use_ifn_p
4479 || (DR_IS_READ (dr)
4480 ? (targetm.vectorize.builtin_gather
4481 && targetm.vectorize.builtin_gather (vectype,
4482 TREE_TYPE (op0),
4483 scale))
4484 : (targetm.vectorize.builtin_scatter
4485 && targetm.vectorize.builtin_scatter (vectype,
4486 TREE_TYPE (op0),
4487 scale)))
4488 || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4489 TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4491 off = op0;
4492 offtype = TREE_TYPE (off);
4493 STRIP_NOPS (off);
4494 continue;
4496 break;
4497 default:
4498 break;
4500 break;
4503 /* If at the end OFF still isn't a SSA_NAME or isn't
4504 defined in the loop, punt. */
4505 if (TREE_CODE (off) != SSA_NAME
4506 || expr_invariant_in_loop_p (loop, off))
4507 return false;
4509 if (offtype == NULL_TREE)
4510 offtype = TREE_TYPE (off);
4512 if (use_ifn_p)
4514 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4515 vectype, memory_type, offtype, scale,
4516 &ifn, &offset_vectype))
4517 ifn = IFN_LAST;
4518 decl = NULL_TREE;
4520 else
4522 if (DR_IS_READ (dr))
4524 if (targetm.vectorize.builtin_gather)
4525 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4527 else
4529 if (targetm.vectorize.builtin_scatter)
4530 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4532 ifn = IFN_LAST;
4533 /* The offset vector type will be read from DECL when needed. */
4534 offset_vectype = NULL_TREE;
4537 info->ifn = ifn;
4538 info->decl = decl;
4539 info->base = base;
4540 info->offset = off;
4541 info->offset_dt = vect_unknown_def_type;
4542 info->offset_vectype = offset_vectype;
4543 info->scale = scale;
4544 info->element_type = TREE_TYPE (vectype);
4545 info->memory_type = memory_type;
4546 return true;
4549 /* Find the data references in STMT, analyze them with respect to LOOP and
4550 append them to DATAREFS. Return false if datarefs in this stmt cannot
4551 be handled. */
4553 opt_result
4554 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4555 vec<data_reference_p> *datarefs,
4556 vec<int> *dataref_groups, int group_id)
4558 /* We can ignore clobbers for dataref analysis - they are removed during
4559 loop vectorization and BB vectorization checks dependences with a
4560 stmt walk. */
4561 if (gimple_clobber_p (stmt))
4562 return opt_result::success ();
4564 if (gimple_has_volatile_ops (stmt))
4565 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4566 stmt);
4568 if (stmt_can_throw_internal (cfun, stmt))
4569 return opt_result::failure_at (stmt,
4570 "not vectorized:"
4571 " statement can throw an exception: %G",
4572 stmt);
4574 auto_vec<data_reference_p, 2> refs;
4575 opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4576 if (!res)
4577 return res;
4579 if (refs.is_empty ())
4580 return opt_result::success ();
4582 if (refs.length () > 1)
4584 while (!refs.is_empty ())
4585 free_data_ref (refs.pop ());
4586 return opt_result::failure_at (stmt,
4587 "not vectorized: more than one "
4588 "data ref in stmt: %G", stmt);
4591 data_reference_p dr = refs.pop ();
4592 if (gcall *call = dyn_cast <gcall *> (stmt))
4593 if (!gimple_call_internal_p (call)
4594 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4595 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4597 free_data_ref (dr);
4598 return opt_result::failure_at (stmt,
4599 "not vectorized: dr in a call %G", stmt);
4602 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4603 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4605 free_data_ref (dr);
4606 return opt_result::failure_at (stmt,
4607 "not vectorized:"
4608 " statement is an unsupported"
4609 " bitfield access %G", stmt);
4612 if (DR_BASE_ADDRESS (dr)
4613 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4615 free_data_ref (dr);
4616 return opt_result::failure_at (stmt,
4617 "not vectorized:"
4618 " base addr of dr is a constant\n");
4621 /* Check whether this may be a SIMD lane access and adjust the
4622 DR to make it easier for us to handle it. */
4623 if (loop
4624 && loop->simduid
4625 && (!DR_BASE_ADDRESS (dr)
4626 || !DR_OFFSET (dr)
4627 || !DR_INIT (dr)
4628 || !DR_STEP (dr)))
4630 struct data_reference *newdr
4631 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4632 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4633 if (DR_BASE_ADDRESS (newdr)
4634 && DR_OFFSET (newdr)
4635 && DR_INIT (newdr)
4636 && DR_STEP (newdr)
4637 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4638 && integer_zerop (DR_STEP (newdr)))
4640 tree base_address = DR_BASE_ADDRESS (newdr);
4641 tree off = DR_OFFSET (newdr);
4642 tree step = ssize_int (1);
4643 if (integer_zerop (off)
4644 && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4646 off = TREE_OPERAND (base_address, 1);
4647 base_address = TREE_OPERAND (base_address, 0);
4649 STRIP_NOPS (off);
4650 if (TREE_CODE (off) == MULT_EXPR
4651 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4653 step = TREE_OPERAND (off, 1);
4654 off = TREE_OPERAND (off, 0);
4655 STRIP_NOPS (off);
4657 if (CONVERT_EXPR_P (off)
4658 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4659 < TYPE_PRECISION (TREE_TYPE (off))))
4660 off = TREE_OPERAND (off, 0);
4661 if (TREE_CODE (off) == SSA_NAME)
4663 gimple *def = SSA_NAME_DEF_STMT (off);
4664 /* Look through widening conversion. */
4665 if (is_gimple_assign (def)
4666 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4668 tree rhs1 = gimple_assign_rhs1 (def);
4669 if (TREE_CODE (rhs1) == SSA_NAME
4670 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4671 && (TYPE_PRECISION (TREE_TYPE (off))
4672 > TYPE_PRECISION (TREE_TYPE (rhs1))))
4673 def = SSA_NAME_DEF_STMT (rhs1);
4675 if (is_gimple_call (def)
4676 && gimple_call_internal_p (def)
4677 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4679 tree arg = gimple_call_arg (def, 0);
4680 tree reft = TREE_TYPE (DR_REF (newdr));
4681 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4682 arg = SSA_NAME_VAR (arg);
4683 if (arg == loop->simduid
4684 /* For now. */
4685 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4687 DR_BASE_ADDRESS (newdr) = base_address;
4688 DR_OFFSET (newdr) = ssize_int (0);
4689 DR_STEP (newdr) = step;
4690 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4691 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4692 /* Mark as simd-lane access. */
4693 tree arg2 = gimple_call_arg (def, 1);
4694 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4695 free_data_ref (dr);
4696 datarefs->safe_push (newdr);
4697 if (dataref_groups)
4698 dataref_groups->safe_push (group_id);
4699 return opt_result::success ();
4704 free_data_ref (newdr);
4707 datarefs->safe_push (dr);
4708 if (dataref_groups)
4709 dataref_groups->safe_push (group_id);
4710 return opt_result::success ();
4713 /* Function vect_analyze_data_refs.
4715 Find all the data references in the loop or basic block.
4717 The general structure of the analysis of data refs in the vectorizer is as
4718 follows:
4719 1- vect_analyze_data_refs(loop/bb): call
4720 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4721 in the loop/bb and their dependences.
4722 2- vect_analyze_data_ref_dependences(): apply dependence testing using ddrs.
4723 3- vect_analyze_data_refs_alignment(): check that ref_stmt.alignment is ok.
4724 4- vect_analyze_data_ref_accesses(): check that ref_stmt.step is ok.
4728 opt_result
4729 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4731 class loop *loop = NULL;
4732 unsigned int i;
4733 struct data_reference *dr;
4734 tree scalar_type;
4736 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4738 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4739 loop = LOOP_VINFO_LOOP (loop_vinfo);
4741 /* Go through the data-refs, check that the analysis succeeded. Update
4742 pointer from stmt_vec_info struct to DR and vectype. */
4744 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4745 FOR_EACH_VEC_ELT (datarefs, i, dr)
4747 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4748 poly_uint64 vf;
4750 gcc_assert (DR_REF (dr));
4751 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4752 gcc_assert (!stmt_info->dr_aux.dr);
4753 stmt_info->dr_aux.dr = dr;
4754 stmt_info->dr_aux.stmt = stmt_info;
4756 /* Check that analysis of the data-ref succeeded. */
4757 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4758 || !DR_STEP (dr))
4760 bool maybe_gather
4761 = DR_IS_READ (dr)
4762 && !TREE_THIS_VOLATILE (DR_REF (dr));
4763 bool maybe_scatter
4764 = DR_IS_WRITE (dr)
4765 && !TREE_THIS_VOLATILE (DR_REF (dr));
4767 /* If the target supports vector gather loads or scatter stores,
4768 see if they can be used. */
4769 if (is_a <loop_vec_info> (vinfo)
4770 && !nested_in_vect_loop_p (loop, stmt_info))
4772 if (maybe_gather || maybe_scatter)
4774 if (maybe_gather)
4775 gatherscatter = GATHER;
4776 else
4777 gatherscatter = SCATTER;
4781 if (gatherscatter == SG_NONE)
4783 if (dump_enabled_p ())
4784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4785 "not vectorized: data ref analysis "
4786 "failed %G", stmt_info->stmt);
4787 if (is_a <bb_vec_info> (vinfo))
4789 /* In BB vectorization the ref can still participate
4790 in dependence analysis, we just can't vectorize it. */
4791 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4792 continue;
4794 return opt_result::failure_at (stmt_info->stmt,
4795 "not vectorized:"
4796 " data ref analysis failed: %G",
4797 stmt_info->stmt);
4801 /* See if this was detected as SIMD lane access. */
4802 if (dr->aux == (void *)-1
4803 || dr->aux == (void *)-2
4804 || dr->aux == (void *)-3
4805 || dr->aux == (void *)-4)
4807 if (nested_in_vect_loop_p (loop, stmt_info))
4808 return opt_result::failure_at (stmt_info->stmt,
4809 "not vectorized:"
4810 " data ref analysis failed: %G",
4811 stmt_info->stmt);
4812 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4813 = -(uintptr_t) dr->aux;
4816 tree base = get_base_address (DR_REF (dr));
4817 if (base && VAR_P (base) && DECL_NONALIASED (base))
4819 if (dump_enabled_p ())
4820 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4821 "not vectorized: base object not addressable "
4822 "for stmt: %G", stmt_info->stmt);
4823 if (is_a <bb_vec_info> (vinfo))
4825 /* In BB vectorization the ref can still participate
4826 in dependence analysis, we just can't vectorize it. */
4827 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4828 continue;
4830 return opt_result::failure_at (stmt_info->stmt,
4831 "not vectorized: base object not"
4832 " addressable for stmt: %G",
4833 stmt_info->stmt);
4836 if (is_a <loop_vec_info> (vinfo)
4837 && DR_STEP (dr)
4838 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4840 if (nested_in_vect_loop_p (loop, stmt_info))
4841 return opt_result::failure_at (stmt_info->stmt,
4842 "not vectorized: "
4843 "not suitable for strided load %G",
4844 stmt_info->stmt);
4845 STMT_VINFO_STRIDED_P (stmt_info) = true;
4848 /* Update DR field in stmt_vec_info struct. */
4850 /* If the dataref is in an inner-loop of the loop that is considered
4851 for vectorization, we also want to analyze the access relative to
4852 the outer-loop (DR contains information only relative to the
4853 inner-most enclosing loop). We do that by building a reference to the
4854 first location accessed by the inner-loop, and analyze it relative to
4855 the outer-loop. */
4856 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4858 /* Build a reference to the first location accessed by the
4859 inner loop: *(BASE + INIT + OFFSET). By construction,
4860 this address must be invariant in the inner loop, so we
4861 can consider it as being used in the outer loop. */
4862 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4863 tree offset = unshare_expr (DR_OFFSET (dr));
4864 tree init = unshare_expr (DR_INIT (dr));
4865 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4866 init, offset);
4867 tree init_addr = fold_build_pointer_plus (base, init_offset);
4868 tree init_ref = build_fold_indirect_ref (init_addr);
4870 if (dump_enabled_p ())
4871 dump_printf_loc (MSG_NOTE, vect_location,
4872 "analyze in outer loop: %T\n", init_ref);
4874 opt_result res
4875 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4876 init_ref, loop, stmt_info->stmt);
4877 if (!res)
4878 /* dr_analyze_innermost already explained the failure. */
4879 return res;
4881 if (dump_enabled_p ())
4882 dump_printf_loc (MSG_NOTE, vect_location,
4883 "\touter base_address: %T\n"
4884 "\touter offset from base address: %T\n"
4885 "\touter constant offset from base address: %T\n"
4886 "\touter step: %T\n"
4887 "\touter base alignment: %d\n\n"
4888 "\touter base misalignment: %d\n"
4889 "\touter offset alignment: %d\n"
4890 "\touter step alignment: %d\n",
4891 STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4892 STMT_VINFO_DR_OFFSET (stmt_info),
4893 STMT_VINFO_DR_INIT (stmt_info),
4894 STMT_VINFO_DR_STEP (stmt_info),
4895 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4896 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4897 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4898 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4901 /* Set vectype for STMT. */
4902 scalar_type = TREE_TYPE (DR_REF (dr));
4903 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4904 if (!vectype)
4906 if (dump_enabled_p ())
4908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4909 "not vectorized: no vectype for stmt: %G",
4910 stmt_info->stmt);
4911 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4912 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4913 scalar_type);
4914 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4917 if (is_a <bb_vec_info> (vinfo))
4919 /* No vector type is fine, the ref can still participate
4920 in dependence analysis, we just can't vectorize it. */
4921 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4922 continue;
4924 if (fatal)
4925 *fatal = false;
4926 return opt_result::failure_at (stmt_info->stmt,
4927 "not vectorized:"
4928 " no vectype for stmt: %G"
4929 " scalar_type: %T\n",
4930 stmt_info->stmt, scalar_type);
4932 else
4934 if (dump_enabled_p ())
4935 dump_printf_loc (MSG_NOTE, vect_location,
4936 "got vectype for stmt: %G%T\n",
4937 stmt_info->stmt, vectype);
4940 /* Adjust the minimal vectorization factor according to the
4941 vector type. */
4942 vf = TYPE_VECTOR_SUBPARTS (vectype);
4943 *min_vf = upper_bound (*min_vf, vf);
4945 /* Leave the BB vectorizer to pick the vector type later, based on
4946 the final dataref group size and SLP node size. */
4947 if (is_a <loop_vec_info> (vinfo))
4948 STMT_VINFO_VECTYPE (stmt_info) = vectype;
4950 if (gatherscatter != SG_NONE)
4952 gather_scatter_info gs_info;
4953 if (!vect_check_gather_scatter (stmt_info,
4954 as_a <loop_vec_info> (vinfo),
4955 &gs_info)
4956 || !get_vectype_for_scalar_type (vinfo,
4957 TREE_TYPE (gs_info.offset)))
4959 if (fatal)
4960 *fatal = false;
4961 return opt_result::failure_at
4962 (stmt_info->stmt,
4963 (gatherscatter == GATHER)
4964 ? "not vectorized: not suitable for gather load %G"
4965 : "not vectorized: not suitable for scatter store %G",
4966 stmt_info->stmt);
4968 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4972 /* We used to stop processing and prune the list here. Verify we no
4973 longer need to. */
4974 gcc_assert (i == datarefs.length ());
4976 return opt_result::success ();
4980 /* Function vect_get_new_vect_var.
4982 Returns a name for a new variable. The current naming scheme appends the
4983 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4984 the name of vectorizer generated variables, and appends that to NAME if
4985 provided. */
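/* E.g. vect_get_new_vect_var (ptr_type, vect_pointer_var, "a") creates a
   temporary register whose name starts with "vectp_a" (illustrative call;
   any uniquifying suffix is up to create_tmp_reg).  */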
4987 tree
4988 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4990 const char *prefix;
4991 tree new_vect_var;
4993 switch (var_kind)
4995 case vect_simple_var:
4996 prefix = "vect";
4997 break;
4998 case vect_scalar_var:
4999 prefix = "stmp";
5000 break;
5001 case vect_mask_var:
5002 prefix = "mask";
5003 break;
5004 case vect_pointer_var:
5005 prefix = "vectp";
5006 break;
5007 default:
5008 gcc_unreachable ();
5011 if (name)
5013 char* tmp = concat (prefix, "_", name, NULL);
5014 new_vect_var = create_tmp_reg (type, tmp);
5015 free (tmp);
5017 else
5018 new_vect_var = create_tmp_reg (type, prefix);
5020 return new_vect_var;
5023 /* Like vect_get_new_vect_var but return an SSA name. */
5025 tree
5026 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
5028 const char *prefix;
5029 tree new_vect_var;
5031 switch (var_kind)
5033 case vect_simple_var:
5034 prefix = "vect";
5035 break;
5036 case vect_scalar_var:
5037 prefix = "stmp";
5038 break;
5039 case vect_pointer_var:
5040 prefix = "vectp";
5041 break;
5042 default:
5043 gcc_unreachable ();
5046 if (name)
5048 char* tmp = concat (prefix, "_", name, NULL);
5049 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
5050 free (tmp);
5052 else
5053 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
5055 return new_vect_var;
5058 /* Duplicate points-to info on NAME from DR_INFO. */
5060 static void
5061 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
5063 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
5064 /* DR_PTR_INFO is for a base SSA name, not including constant or
5065 variable offsets in the ref so its alignment info does not apply. */
5066 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
5069 /* Function vect_create_addr_base_for_vector_ref.
5071 Create an expression that computes the address of the first memory location
5072 that will be accessed for a data reference.
5074 Input:
5075 STMT_INFO: The statement containing the data reference.
5076 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
5077 OFFSET: Optional. If supplied, it is added to the initial address.
5078 LOOP: Specify relative to which loop-nest the address should be computed.
5079 For example, when the dataref is in an inner-loop nested in an
5080 outer-loop that is now being vectorized, LOOP can be either the
5081 outer-loop, or the inner-loop. The first memory location accessed
5082 by the following dataref ('in' points to short):
5084 for (i=0; i<N; i++)
5085 for (j=0; j<M; j++)
5086 s += in[i+j]
5088 is as follows:
5089 if LOOP=i_loop: &in (relative to i_loop)
5090 if LOOP=j_loop: &in+i*2B (relative to j_loop)
5092 Output:
5093 1. Return an SSA_NAME whose value is the address of the memory location of
5094 the first vector of the data reference.
5095 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
5096 these statement(s) which define the returned SSA_NAME.
5098 FORNOW: We are only handling array accesses with step 1. */
5100 tree
5101 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
5102 gimple_seq *new_stmt_list,
5103 tree offset)
5105 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5106 struct data_reference *dr = dr_info->dr;
5107 const char *base_name;
5108 tree addr_base;
5109 tree dest;
5110 gimple_seq seq = NULL;
5111 tree vect_ptr_type;
5112 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5113 innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
5115 tree data_ref_base = unshare_expr (drb->base_address);
5116 tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
5117 tree init = unshare_expr (drb->init);
5119 if (loop_vinfo)
5120 base_name = get_name (data_ref_base);
5121 else
5123 base_offset = ssize_int (0);
5124 init = ssize_int (0);
5125 base_name = get_name (DR_REF (dr));
5128 /* Create base_offset */
5129 base_offset = size_binop (PLUS_EXPR,
5130 fold_convert (sizetype, base_offset),
5131 fold_convert (sizetype, init));
5133 if (offset)
5135 offset = fold_convert (sizetype, offset);
5136 base_offset = fold_build2 (PLUS_EXPR, sizetype,
5137 base_offset, offset);
5140 /* base + base_offset */
5141 if (loop_vinfo)
5142 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
5143 else
5144 addr_base = build1 (ADDR_EXPR,
5145 build_pointer_type (TREE_TYPE (DR_REF (dr))),
5146 /* Strip zero offset components since we don't need
5147 them and they can confuse late diagnostics if
5148 we CSE them wrongly. See PR106904 for example. */
5149 unshare_expr (strip_zero_offset_components
5150 (DR_REF (dr))));
5152 vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
5153 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
5154 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
5155 gimple_seq_add_seq (new_stmt_list, seq);
5157 if (DR_PTR_INFO (dr)
5158 && TREE_CODE (addr_base) == SSA_NAME
5159 /* We should only duplicate pointer info to newly created SSA names. */
5160 && SSA_NAME_VAR (addr_base) == dest)
5162 gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
5163 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
5166 if (dump_enabled_p ())
5167 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
5169 return addr_base;
5173 /* Function vect_create_data_ref_ptr.
5175 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
5176 location accessed in the loop by STMT_INFO, along with the def-use update
5177 chain to appropriately advance the pointer through the loop iterations.
5178 Also set aliasing information for the pointer. This pointer is used by
5179 the callers to this function to create a memory reference expression for
5180 vector load/store access.
5182 Input:
5183 1. STMT_INFO: a stmt that references memory. Expected to be of the form
5184 GIMPLE_ASSIGN <name, data-ref> or
5185 GIMPLE_ASSIGN <data-ref, name>.
5186 2. AGGR_TYPE: the type of the reference, which should be either a vector
5187 or an array.
5188 3. AT_LOOP: the loop where the vector memref is to be created.
5189 4. OFFSET (optional): a byte offset to be added to the initial address
5190 accessed by the data-ref in STMT_INFO.
5191 5. BSI: location where the new stmts are to be placed if there is no loop
5192 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
5193 pointing to the initial address.
5194    7. IV_STEP (optional, defaults to NULL): the amount that should be added
5195 to the IV during each iteration of the loop. NULL says to move
5196 by one copy of AGGR_TYPE up or down, depending on the step of the
5197 data reference.
5199 Output:
5200 1. Declare a new ptr to vector_type, and have it point to the base of the
5201       data reference (initial address accessed by the data reference).
5202 For example, for vector of type V8HI, the following code is generated:
5204 v8hi *ap;
5205 ap = (v8hi *)initial_address;
5207 if OFFSET is not supplied:
5208 initial_address = &a[init];
5209 if OFFSET is supplied:
5210 initial_address = &a[init] + OFFSET;
5214 Return the initial_address in INITIAL_ADDRESS.
5216 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
5217 update the pointer in each iteration of the loop.
5219 Return the increment stmt that updates the pointer in PTR_INCR.
5221 3. Return the pointer. */
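/* A rough caller sketch (argument values are placeholders, not taken from any
   particular call site):

     tree init_addr;
     gimple *ptr_incr;
     tree dataref_ptr
       = vect_create_data_ref_ptr (vinfo, stmt_info, vectype, loop,
				   NULL_TREE, &init_addr, gsi, &ptr_incr,
				   false, NULL_TREE);
     ...
     // Later copies of the access may advance the pointer explicitly:
     dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
				    stmt_info, bump);

   where VECTYPE, LOOP, GSI and BUMP stand for whatever the caller has at
   hand; the actual users are the load/store vectorization code.  */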
5223 tree
5224 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
5225 tree aggr_type, class loop *at_loop, tree offset,
5226 tree *initial_address, gimple_stmt_iterator *gsi,
5227 gimple **ptr_incr, bool only_init,
5228 tree iv_step)
5230 const char *base_name;
5231 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5232 class loop *loop = NULL;
5233 bool nested_in_vect_loop = false;
5234 class loop *containing_loop = NULL;
5235 tree aggr_ptr_type;
5236 tree aggr_ptr;
5237 tree new_temp;
5238 gimple_seq new_stmt_list = NULL;
5239 edge pe = NULL;
5240 basic_block new_bb;
5241 tree aggr_ptr_init;
5242 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5243 struct data_reference *dr = dr_info->dr;
5244 tree aptr;
5245 gimple_stmt_iterator incr_gsi;
5246 bool insert_after;
5247 tree indx_before_incr, indx_after_incr;
5248 gimple *incr;
5249 bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
5251 gcc_assert (iv_step != NULL_TREE
5252 || TREE_CODE (aggr_type) == ARRAY_TYPE
5253 || TREE_CODE (aggr_type) == VECTOR_TYPE);
5255 if (loop_vinfo)
5257 loop = LOOP_VINFO_LOOP (loop_vinfo);
5258 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5259 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5260 pe = loop_preheader_edge (loop);
5262 else
5264 gcc_assert (bb_vinfo);
5265 only_init = true;
5266 *ptr_incr = NULL;
5269 /* Create an expression for the first address accessed by this load
5270 in LOOP. */
5271 base_name = get_name (DR_BASE_ADDRESS (dr));
5273 if (dump_enabled_p ())
5275 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
5276 dump_printf_loc (MSG_NOTE, vect_location,
5277 "create %s-pointer variable to type: %T",
5278 get_tree_code_name (TREE_CODE (aggr_type)),
5279 aggr_type);
5280 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
5281 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
5282 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
5283 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
5284 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
5285 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
5286 else
5287 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
5288 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
5291 /* (1) Create the new aggregate-pointer variable.
5292 Vector and array types inherit the alias set of their component
5293 type by default so we need to use a ref-all pointer if the data
5294      reference does not conflict with the created aggregate data
5295 reference because it is not addressable. */
5296 bool need_ref_all = false;
5297 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5298 get_alias_set (DR_REF (dr))))
5299 need_ref_all = true;
5300 /* Likewise for any of the data references in the stmt group. */
5301 else if (DR_GROUP_SIZE (stmt_info) > 1)
5303 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5306 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5307 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5308 get_alias_set (DR_REF (sdr))))
5310 need_ref_all = true;
5311 break;
5313 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5315 while (sinfo);
5317 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5318 need_ref_all);
5319 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5322 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5323 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5324 def-use update cycles for the pointer: one relative to the outer-loop
5325 (LOOP), which is what steps (3) and (4) below do. The other is relative
5326 to the inner-loop (which is the inner-most loop containing the dataref),
5327         and this is done by step (5) below.
5329 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5330 inner-most loop, and so steps (3),(4) work the same, and step (5) is
5331 redundant. Steps (3),(4) create the following:
5333 vp0 = &base_addr;
5334 LOOP: vp1 = phi(vp0,vp2)
5337 vp2 = vp1 + step
5338 goto LOOP
5340 If there is an inner-loop nested in loop, then step (5) will also be
5341 applied, and an additional update in the inner-loop will be created:
5343 vp0 = &base_addr;
5344 LOOP: vp1 = phi(vp0,vp2)
5346 inner: vp3 = phi(vp1,vp4)
5347 vp4 = vp3 + inner_step
5348 if () goto inner
5350 vp2 = vp1 + step
5351 if () goto LOOP */
5353 /* (2) Calculate the initial address of the aggregate-pointer, and set
5354 the aggregate-pointer to point to it before the loop. */
5356   /* Create: &(base[init_val] + offset) in the loop preheader.  */
5358 new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5359 stmt_info, &new_stmt_list,
5360 offset);
5361 if (new_stmt_list)
5363 if (pe)
5365 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5366 gcc_assert (!new_bb);
5368 else
5369 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5372 *initial_address = new_temp;
5373 aggr_ptr_init = new_temp;
5375 /* (3) Handle the updating of the aggregate-pointer inside the loop.
5376 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5377 inner-loop nested in LOOP (during outer-loop vectorization). */
5379 /* No update in loop is required. */
5380 if (only_init && (!loop_vinfo || at_loop == loop))
5381 aptr = aggr_ptr_init;
5382 else
5384 /* Accesses to invariant addresses should be handled specially
5385 by the caller. */
5386 tree step = vect_dr_behavior (vinfo, dr_info)->step;
5387 gcc_assert (!integer_zerop (step));
5389 if (iv_step == NULL_TREE)
5391 /* The step of the aggregate pointer is the type size,
5392 negated for downward accesses. */
5393 iv_step = TYPE_SIZE_UNIT (aggr_type);
5394 if (tree_int_cst_sgn (step) == -1)
5395 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5398 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5400 create_iv (aggr_ptr_init, PLUS_EXPR,
5401 fold_convert (aggr_ptr_type, iv_step),
5402 aggr_ptr, loop, &incr_gsi, insert_after,
5403 &indx_before_incr, &indx_after_incr);
5404 incr = gsi_stmt (incr_gsi);
5406 /* Copy the points-to information if it exists. */
5407 if (DR_PTR_INFO (dr))
5409 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5410 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5412 if (ptr_incr)
5413 *ptr_incr = incr;
5415 aptr = indx_before_incr;
5418 if (!nested_in_vect_loop || only_init)
5419 return aptr;
5422 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5423      nested in LOOP, if it exists.  */
5425 gcc_assert (nested_in_vect_loop);
5426 if (!only_init)
5428 standard_iv_increment_position (containing_loop, &incr_gsi,
5429 &insert_after);
5430 create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
5431 aggr_ptr, containing_loop, &incr_gsi, insert_after,
5432 &indx_before_incr, &indx_after_incr);
5433 incr = gsi_stmt (incr_gsi);
5435 /* Copy the points-to information if it exists. */
5436 if (DR_PTR_INFO (dr))
5438 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5439 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5441 if (ptr_incr)
5442 *ptr_incr = incr;
5444 return indx_before_incr;
5446 else
5447 gcc_unreachable ();
5451 /* Function bump_vector_ptr
5453 Increment a pointer (to a vector type) by vector-size. If requested,
5454 i.e. if PTR-INCR is given, then also connect the new increment stmt
5455 to the existing def-use update-chain of the pointer, by modifying
5456 the PTR_INCR as illustrated below:
5458 The pointer def-use update-chain before this function:
5459 DATAREF_PTR = phi (p_0, p_2)
5460 ....
5461 PTR_INCR: p_2 = DATAREF_PTR + step
5463 The pointer def-use update-chain after this function:
5464 DATAREF_PTR = phi (p_0, p_2)
5465 ....
5466 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5467 ....
5468 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5470 Input:
5471 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5472 in the loop.
5473 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5474 the loop. The increment amount across iterations is expected
5475 to be vector_size.
5476 BSI - location where the new update stmt is to be placed.
5477 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5478 BUMP - optional. The offset by which to bump the pointer. If not given,
5479 the offset is assumed to be vector_size.
5481 Output: Return NEW_DATAREF_PTR as illustrated above.
5485 tree
5486 bump_vector_ptr (vec_info *vinfo,
5487 tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5488 stmt_vec_info stmt_info, tree bump)
5490 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5491 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5492 tree update = TYPE_SIZE_UNIT (vectype);
5493 gimple *incr_stmt;
5494 ssa_op_iter iter;
5495 use_operand_p use_p;
5496 tree new_dataref_ptr;
5498 if (bump)
5499 update = bump;
5501 if (TREE_CODE (dataref_ptr) == SSA_NAME)
5502 new_dataref_ptr = copy_ssa_name (dataref_ptr);
5503 else if (is_gimple_min_invariant (dataref_ptr))
5504 /* When possible avoid emitting a separate increment stmt that will
5505        force the addressed object to become addressable.  */
5506 return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
5507 fold_build2 (MEM_REF,
5508 TREE_TYPE (TREE_TYPE (dataref_ptr)),
5509 dataref_ptr,
5510 fold_convert (ptr_type_node, update)));
5511 else
5512 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5513 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5514 dataref_ptr, update);
5515 vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5516   /* Fold the increment, avoiding excessive use-def chains that would
5517      otherwise cause compile-time issues for passes until the next
5518      forwprop pass, which would do this folding as well.  */
5519 gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5520 if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5522 incr_stmt = gsi_stmt (fold_gsi);
5523 update_stmt (incr_stmt);
5526 /* Copy the points-to information if it exists. */
5527 if (DR_PTR_INFO (dr))
5529 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5530 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5533 if (!ptr_incr)
5534 return new_dataref_ptr;
5536 /* Update the vector-pointer's cross-iteration increment. */
5537 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5539 tree use = USE_FROM_PTR (use_p);
5541 if (use == dataref_ptr)
5542 SET_USE (use_p, new_dataref_ptr);
5543 else
5544 gcc_assert (operand_equal_p (use, update, 0));
5547 return new_dataref_ptr;
5551 /* Copy memory reference info such as base/clique from the SRC reference
5552 to the DEST MEM_REF. */
5554 void
5555 vect_copy_ref_info (tree dest, tree src)
5557 if (TREE_CODE (dest) != MEM_REF)
5558 return;
5560 tree src_base = src;
5561 while (handled_component_p (src_base))
5562 src_base = TREE_OPERAND (src_base, 0);
5563 if (TREE_CODE (src_base) != MEM_REF
5564 && TREE_CODE (src_base) != TARGET_MEM_REF)
5565 return;
5567 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5568 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5572 /* Function vect_create_destination_var.
5574 Create a new temporary of type VECTYPE. */
5576 tree
5577 vect_create_destination_var (tree scalar_dest, tree vectype)
5579 tree vec_dest;
5580 const char *name;
5581 char *new_name;
5582 tree type;
5583 enum vect_var_kind kind;
5585 kind = vectype
5586 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5587 ? vect_mask_var
5588 : vect_simple_var
5589 : vect_scalar_var;
5590 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5592 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5594 name = get_name (scalar_dest);
5595 if (name)
5596 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5597 else
5598 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5599 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5600 free (new_name);
5602 return vec_dest;
5605 /* Function vect_grouped_store_supported.
5607 Returns TRUE if interleave high and interleave low permutations
5608 are supported, and FALSE otherwise. */
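/* For instance, for an 8-element vector the two permutations whose support is
   queried below correspond conceptually to the index vectors

     interleave "high": { 0, 8, 1, 9, 2, 10, 3, 11 }
     interleave "low" : { 4, 12, 5, 13, 6, 14, 7, 15 }

   encoded with two interleaved stepped patterns so that the same check also
   applies to variable-length vectors.  */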
5610 bool
5611 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5613 machine_mode mode = TYPE_MODE (vectype);
5615 /* vect_permute_store_chain requires the group size to be equal to 3 or
5616 be a power of two. */
5617 if (count != 3 && exact_log2 (count) == -1)
5619 if (dump_enabled_p ())
5620 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5621 "the size of the group of accesses"
5622 " is not a power of 2 or not eqaul to 3\n");
5623 return false;
5626 /* Check that the permutation is supported. */
5627 if (VECTOR_MODE_P (mode))
5629 unsigned int i;
5630 if (count == 3)
5632 unsigned int j0 = 0, j1 = 0, j2 = 0;
5633 unsigned int i, j;
5635 unsigned int nelt;
5636 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5638 if (dump_enabled_p ())
5639 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5640 "cannot handle groups of 3 stores for"
5641 " variable-length vectors\n");
5642 return false;
5645 vec_perm_builder sel (nelt, nelt, 1);
5646 sel.quick_grow (nelt);
5647 vec_perm_indices indices;
5648 for (j = 0; j < 3; j++)
5650 int nelt0 = ((3 - j) * nelt) % 3;
5651 int nelt1 = ((3 - j) * nelt + 1) % 3;
5652 int nelt2 = ((3 - j) * nelt + 2) % 3;
5653 for (i = 0; i < nelt; i++)
5655 if (3 * i + nelt0 < nelt)
5656 sel[3 * i + nelt0] = j0++;
5657 if (3 * i + nelt1 < nelt)
5658 sel[3 * i + nelt1] = nelt + j1++;
5659 if (3 * i + nelt2 < nelt)
5660 sel[3 * i + nelt2] = 0;
5662 indices.new_vector (sel, 2, nelt);
5663 if (!can_vec_perm_const_p (mode, mode, indices))
5665 if (dump_enabled_p ())
5666 dump_printf (MSG_MISSED_OPTIMIZATION,
5667 "permutation op not supported by target.\n");
5668 return false;
5671 for (i = 0; i < nelt; i++)
5673 if (3 * i + nelt0 < nelt)
5674 sel[3 * i + nelt0] = 3 * i + nelt0;
5675 if (3 * i + nelt1 < nelt)
5676 sel[3 * i + nelt1] = 3 * i + nelt1;
5677 if (3 * i + nelt2 < nelt)
5678 sel[3 * i + nelt2] = nelt + j2++;
5680 indices.new_vector (sel, 2, nelt);
5681 if (!can_vec_perm_const_p (mode, mode, indices))
5683 if (dump_enabled_p ())
5684 dump_printf (MSG_MISSED_OPTIMIZATION,
5685 "permutation op not supported by target.\n");
5686 return false;
5689 return true;
5691 else
5693 /* If length is not equal to 3 then only power of 2 is supported. */
5694 gcc_assert (pow2p_hwi (count));
5695 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5697 /* The encoding has 2 interleaved stepped patterns. */
5698       if (!multiple_p (nelt, 2))
5699 return false;
5700 vec_perm_builder sel (nelt, 2, 3);
5701 sel.quick_grow (6);
5702 for (i = 0; i < 3; i++)
5704 sel[i * 2] = i;
5705 sel[i * 2 + 1] = i + nelt;
5707 vec_perm_indices indices (sel, 2, nelt);
5708 if (can_vec_perm_const_p (mode, mode, indices))
5710 for (i = 0; i < 6; i++)
5711 sel[i] += exact_div (nelt, 2);
5712 indices.new_vector (sel, 2, nelt);
5713 if (can_vec_perm_const_p (mode, mode, indices))
5714 return true;
5719 if (dump_enabled_p ())
5720 dump_printf (MSG_MISSED_OPTIMIZATION,
5721 "permutation op not supported by target.\n");
5722 return false;
5725 /* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors
5726 of type VECTYPE. MASKED_P says whether the masked form is needed. */
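/* Illustrative use only (the count of 4 is a placeholder):

     internal_fn ifn = vect_store_lanes_supported (vectype, 4, false);
     if (ifn != IFN_LAST)
       ...  // the group of 4 stores can be emitted as a single
	    // store-lanes internal function

   preferring the mask+len form, then the masked form when MASKED_P, then the
   plain form; on targets with structure store instructions these optabs are
   typically what backs them.  */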
5728 internal_fn
5729 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5730 bool masked_p)
5732 if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
5733 vec_mask_len_store_lanes_optab, vectype,
5734 count))
5735 return IFN_MASK_LEN_STORE_LANES;
5736 else if (masked_p)
5738 if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5739 vec_mask_store_lanes_optab, vectype,
5740 count))
5741 return IFN_MASK_STORE_LANES;
5743 else
5745 if (vect_lanes_optab_supported_p ("vec_store_lanes",
5746 vec_store_lanes_optab, vectype, count))
5747 return IFN_STORE_LANES;
5749 return IFN_LAST;
5753 /* Function vect_permute_store_chain.
5755 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5756 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5757 the data correctly for the stores. Return the final references for stores
5758 in RESULT_CHAIN.
5760 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5761 The input is 4 vectors each containing 8 elements. We assign a number to
5762 each element, the input sequence is:
5764 1st vec: 0 1 2 3 4 5 6 7
5765 2nd vec: 8 9 10 11 12 13 14 15
5766 3rd vec: 16 17 18 19 20 21 22 23
5767 4th vec: 24 25 26 27 28 29 30 31
5769 The output sequence should be:
5771 1st vec: 0 8 16 24 1 9 17 25
5772 2nd vec: 2 10 18 26 3 11 19 27
5773   3rd vec:  4 12 20 28  5 13 21 29
5774 4th vec: 6 14 22 30 7 15 23 31
5776 i.e., we interleave the contents of the four vectors in their order.
5778 We use interleave_high/low instructions to create such output. The input of
5779 each interleave_high/low operation is two vectors:
5780 1st vec 2nd vec
5781 0 1 2 3 4 5 6 7
5782 the even elements of the result vector are obtained left-to-right from the
5783 high/low elements of the first vector. The odd elements of the result are
5784 obtained left-to-right from the high/low elements of the second vector.
5785 The output of interleave_high will be: 0 4 1 5
5786 and of interleave_low: 2 6 3 7
5789 The permutation is done in log LENGTH stages. In each stage interleave_high
5790 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5791 where the first argument is taken from the first half of DR_CHAIN and the
5792   second argument from its second half.
5793 In our example,
5795 I1: interleave_high (1st vec, 3rd vec)
5796 I2: interleave_low (1st vec, 3rd vec)
5797 I3: interleave_high (2nd vec, 4th vec)
5798 I4: interleave_low (2nd vec, 4th vec)
5800 The output for the first stage is:
5802 I1: 0 16 1 17 2 18 3 19
5803 I2: 4 20 5 21 6 22 7 23
5804 I3: 8 24 9 25 10 26 11 27
5805 I4: 12 28 13 29 14 30 15 31
5807 The output of the second stage, i.e. the final result is:
5809 I1: 0 8 16 24 1 9 17 25
5810 I2: 2 10 18 26 3 11 19 27
5811   I3:  4 12 20 28  5 13 21 29
5812 I4: 6 14 22 30 7 15 23 31. */
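/* For LENGTH == 4 the power-of-two path below runs log2(4) == 2 stages; in
   each stage it emits, for every pair (dr_chain[j], dr_chain[j + length/2]),

     vect_inter_high = VEC_PERM_EXPR <vect1, vect2, perm_mask_high>;
     vect_inter_low  = VEC_PERM_EXPR <vect1, vect2, perm_mask_low>;

   storing them into result_chain[2*j] and result_chain[2*j + 1] and copying
   RESULT_CHAIN back into DR_CHAIN before the next stage (the names above are
   the local variables of this function).  */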
5814 void
5815 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5816 unsigned int length,
5817 stmt_vec_info stmt_info,
5818 gimple_stmt_iterator *gsi,
5819 vec<tree> *result_chain)
5821 tree vect1, vect2, high, low;
5822 gimple *perm_stmt;
5823 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5824 tree perm_mask_low, perm_mask_high;
5825 tree data_ref;
5826 tree perm3_mask_low, perm3_mask_high;
5827 unsigned int i, j, n, log_length = exact_log2 (length);
5829 result_chain->quick_grow (length);
5830 memcpy (result_chain->address (), dr_chain.address (),
5831 length * sizeof (tree));
5833 if (length == 3)
5835 /* vect_grouped_store_supported ensures that this is constant. */
5836 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5837 unsigned int j0 = 0, j1 = 0, j2 = 0;
5839 vec_perm_builder sel (nelt, nelt, 1);
5840 sel.quick_grow (nelt);
5841 vec_perm_indices indices;
5842 for (j = 0; j < 3; j++)
5844 int nelt0 = ((3 - j) * nelt) % 3;
5845 int nelt1 = ((3 - j) * nelt + 1) % 3;
5846 int nelt2 = ((3 - j) * nelt + 2) % 3;
5848 for (i = 0; i < nelt; i++)
5850 if (3 * i + nelt0 < nelt)
5851 sel[3 * i + nelt0] = j0++;
5852 if (3 * i + nelt1 < nelt)
5853 sel[3 * i + nelt1] = nelt + j1++;
5854 if (3 * i + nelt2 < nelt)
5855 sel[3 * i + nelt2] = 0;
5857 indices.new_vector (sel, 2, nelt);
5858 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5860 for (i = 0; i < nelt; i++)
5862 if (3 * i + nelt0 < nelt)
5863 sel[3 * i + nelt0] = 3 * i + nelt0;
5864 if (3 * i + nelt1 < nelt)
5865 sel[3 * i + nelt1] = 3 * i + nelt1;
5866 if (3 * i + nelt2 < nelt)
5867 sel[3 * i + nelt2] = nelt + j2++;
5869 indices.new_vector (sel, 2, nelt);
5870 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5872 vect1 = dr_chain[0];
5873 vect2 = dr_chain[1];
5875 /* Create interleaving stmt:
5876 low = VEC_PERM_EXPR <vect1, vect2,
5877 {j, nelt, *, j + 1, nelt + j + 1, *,
5878 j + 2, nelt + j + 2, *, ...}> */
5879 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5880 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5881 vect2, perm3_mask_low);
5882 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5884 vect1 = data_ref;
5885 vect2 = dr_chain[2];
5886 /* Create interleaving stmt:
5887        high = VEC_PERM_EXPR <vect1, vect2,
5888 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5889 6, 7, nelt + j + 2, ...}> */
5890 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5891 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5892 vect2, perm3_mask_high);
5893 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5894 (*result_chain)[j] = data_ref;
5897 else
5899 /* If length is not equal to 3 then only power of 2 is supported. */
5900 gcc_assert (pow2p_hwi (length));
5902 /* The encoding has 2 interleaved stepped patterns. */
5903 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5904 vec_perm_builder sel (nelt, 2, 3);
5905 sel.quick_grow (6);
5906 for (i = 0; i < 3; i++)
5908 sel[i * 2] = i;
5909 sel[i * 2 + 1] = i + nelt;
5911 vec_perm_indices indices (sel, 2, nelt);
5912 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5914 for (i = 0; i < 6; i++)
5915 sel[i] += exact_div (nelt, 2);
5916 indices.new_vector (sel, 2, nelt);
5917 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5919 for (i = 0, n = log_length; i < n; i++)
5921 for (j = 0; j < length/2; j++)
5923 vect1 = dr_chain[j];
5924 vect2 = dr_chain[j+length/2];
5926 /* Create interleaving stmt:
5927 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5928 ...}> */
5929 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5930 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5931 vect2, perm_mask_high);
5932 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5933 (*result_chain)[2*j] = high;
5935 /* Create interleaving stmt:
5936 low = VEC_PERM_EXPR <vect1, vect2,
5937 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5938 ...}> */
5939 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5940 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5941 vect2, perm_mask_low);
5942 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5943 (*result_chain)[2*j+1] = low;
5945 memcpy (dr_chain.address (), result_chain->address (),
5946 length * sizeof (tree));
5951 /* Function vect_setup_realignment
5953 This function is called when vectorizing an unaligned load using
5954 the dr_explicit_realign[_optimized] scheme.
5955 This function generates the following code at the loop prolog:
5957 p = initial_addr;
5958 x msq_init = *(floor(p)); # prolog load
5959 realignment_token = call target_builtin;
5960 loop:
5961 x msq = phi (msq_init, ---)
5963 The stmts marked with x are generated only for the case of
5964 dr_explicit_realign_optimized.
5966 The code above sets up a new (vector) pointer, pointing to the first
5967 location accessed by STMT_INFO, and a "floor-aligned" load using that
5968 pointer. It also generates code to compute the "realignment-token"
5969 (if the relevant target hook was defined), and creates a phi-node at the
5970 loop-header bb whose arguments are the result of the prolog-load (created
5971 by this function) and the result of a load that takes place in the loop
5972 (to be created by the caller to this function).
5974 For the case of dr_explicit_realign_optimized:
5975 The caller to this function uses the phi-result (msq) to create the
5976 realignment code inside the loop, and sets up the missing phi argument,
5977 as follows:
5978 loop:
5979 msq = phi (msq_init, lsq)
5980 lsq = *(floor(p')); # load in loop
5981 result = realign_load (msq, lsq, realignment_token);
5983 For the case of dr_explicit_realign:
5984 loop:
5985 msq = *(floor(p)); # load in loop
5986 p' = p + (VS-1);
5987 lsq = *(floor(p')); # load in loop
5988 result = realign_load (msq, lsq, realignment_token);
5990 Input:
5991 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5992 a memory location that may be unaligned.
5993 BSI - place where new code is to be inserted.
5994 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5995 is used.
5997 Output:
5998 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5999 target hook, if defined.
6000 Return value - the result of the loop-header phi node. */
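/* As a sketch only (SSA names invented, target alignment assumed to be
   16 bytes), the prolog created for dr_explicit_realign_optimized looks
   roughly like

     vectp_1 = <initial address>;
     floor_2 = vectp_1 & -16;               // BIT_AND_EXPR with -DR_TARGET_ALIGNMENT
     msq_init_3 = MEM[(vectype *)floor_2];  // prolog load
     realignment_token_4 = <target mask_for_load builtin> (vectp_1);

   with the last statement present only if the target defines the
   builtin_mask_for_load hook.  */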
6002 tree
6003 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
6004 gimple_stmt_iterator *gsi, tree *realignment_token,
6005 enum dr_alignment_support alignment_support_scheme,
6006 tree init_addr,
6007 class loop **at_loop)
6009 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6010 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6011 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
6012 struct data_reference *dr = dr_info->dr;
6013 class loop *loop = NULL;
6014 edge pe = NULL;
6015 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
6016 tree vec_dest;
6017 gimple *inc;
6018 tree ptr;
6019 tree data_ref;
6020 basic_block new_bb;
6021 tree msq_init = NULL_TREE;
6022 tree new_temp;
6023 gphi *phi_stmt;
6024 tree msq = NULL_TREE;
6025 gimple_seq stmts = NULL;
6026 bool compute_in_loop = false;
6027 bool nested_in_vect_loop = false;
6028 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
6029 class loop *loop_for_initial_load = NULL;
6031 if (loop_vinfo)
6033 loop = LOOP_VINFO_LOOP (loop_vinfo);
6034 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
6037 gcc_assert (alignment_support_scheme == dr_explicit_realign
6038 || alignment_support_scheme == dr_explicit_realign_optimized);
6040 /* We need to generate three things:
6041 1. the misalignment computation
6042 2. the extra vector load (for the optimized realignment scheme).
6043 3. the phi node for the two vectors from which the realignment is
6044 done (for the optimized realignment scheme). */
6046 /* 1. Determine where to generate the misalignment computation.
6048 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
6049 calculation will be generated by this function, outside the loop (in the
6050 preheader). Otherwise, INIT_ADDR had already been computed for us by the
6051 caller, inside the loop.
6053 Background: If the misalignment remains fixed throughout the iterations of
6054 the loop, then both realignment schemes are applicable, and also the
6055 misalignment computation can be done outside LOOP. This is because we are
6056 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
6057 are a multiple of VS (the Vector Size), and therefore the misalignment in
6058 different vectorized LOOP iterations is always the same.
6059 The problem arises only if the memory access is in an inner-loop nested
6060 inside LOOP, which is now being vectorized using outer-loop vectorization.
6061 This is the only case when the misalignment of the memory access may not
6062 remain fixed throughout the iterations of the inner-loop (as explained in
6063 detail in vect_supportable_dr_alignment). In this case, not only is the
6064 optimized realignment scheme not applicable, but also the misalignment
6065 computation (and generation of the realignment token that is passed to
6066 REALIGN_LOAD) have to be done inside the loop.
6068 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
6069 or not, which in turn determines if the misalignment is computed inside
6070 the inner-loop, or outside LOOP. */
6072 if (init_addr != NULL_TREE || !loop_vinfo)
6074 compute_in_loop = true;
6075 gcc_assert (alignment_support_scheme == dr_explicit_realign);
6079 /* 2. Determine where to generate the extra vector load.
6081 For the optimized realignment scheme, instead of generating two vector
6082 loads in each iteration, we generate a single extra vector load in the
6083 preheader of the loop, and in each iteration reuse the result of the
6084 vector load from the previous iteration. In case the memory access is in
6085 an inner-loop nested inside LOOP, which is now being vectorized using
6086 outer-loop vectorization, we need to determine whether this initial vector
6087 load should be generated at the preheader of the inner-loop, or can be
6088 generated at the preheader of LOOP. If the memory access has no evolution
6089 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
6090 to be generated inside LOOP (in the preheader of the inner-loop). */
6092 if (nested_in_vect_loop)
6094 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
6095 bool invariant_in_outerloop =
6096 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
6097 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
6099 else
6100 loop_for_initial_load = loop;
6101 if (at_loop)
6102 *at_loop = loop_for_initial_load;
6104 tree vuse = NULL_TREE;
6105 if (loop_for_initial_load)
6107 pe = loop_preheader_edge (loop_for_initial_load);
6108 if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
6109 vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
6111 if (!vuse)
6112 vuse = gimple_vuse (gsi_stmt (*gsi));
6114 /* 3. For the case of the optimized realignment, create the first vector
6115 load at the loop preheader. */
6117 if (alignment_support_scheme == dr_explicit_realign_optimized)
6119 /* Create msq_init = *(floor(p1)) in the loop preheader */
6120 gassign *new_stmt;
6122 gcc_assert (!compute_in_loop);
6123 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6124 ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
6125 loop_for_initial_load, NULL_TREE,
6126 &init_addr, NULL, &inc, true);
6127 if (TREE_CODE (ptr) == SSA_NAME)
6128 new_temp = copy_ssa_name (ptr);
6129 else
6130 new_temp = make_ssa_name (TREE_TYPE (ptr));
6131 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
6132 tree type = TREE_TYPE (ptr);
6133 new_stmt = gimple_build_assign
6134 (new_temp, BIT_AND_EXPR, ptr,
6135 fold_build2 (MINUS_EXPR, type,
6136 build_int_cst (type, 0),
6137 build_int_cst (type, align)));
6138 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6139 gcc_assert (!new_bb);
6140 data_ref
6141 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
6142 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
6143 vect_copy_ref_info (data_ref, DR_REF (dr));
6144 new_stmt = gimple_build_assign (vec_dest, data_ref);
6145 new_temp = make_ssa_name (vec_dest, new_stmt);
6146 gimple_assign_set_lhs (new_stmt, new_temp);
6147 gimple_set_vuse (new_stmt, vuse);
6148 if (pe)
6150 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6151 gcc_assert (!new_bb);
6153 else
6154 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6156 msq_init = gimple_assign_lhs (new_stmt);
6159 /* 4. Create realignment token using a target builtin, if available.
6160 It is done either inside the containing loop, or before LOOP (as
6161 determined above). */
6163 if (targetm.vectorize.builtin_mask_for_load)
6165 gcall *new_stmt;
6166 tree builtin_decl;
6168       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
6169 if (!init_addr)
6171 /* Generate the INIT_ADDR computation outside LOOP. */
6172 init_addr = vect_create_addr_base_for_vector_ref (vinfo,
6173 stmt_info, &stmts,
6174 NULL_TREE);
6175 if (loop)
6177 pe = loop_preheader_edge (loop);
6178 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6179 gcc_assert (!new_bb);
6181 else
6182 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
6185 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
6186 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
6187 vec_dest =
6188 vect_create_destination_var (scalar_dest,
6189 gimple_call_return_type (new_stmt));
6190 new_temp = make_ssa_name (vec_dest, new_stmt);
6191 gimple_call_set_lhs (new_stmt, new_temp);
6193 if (compute_in_loop)
6194 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6195 else
6197 /* Generate the misalignment computation outside LOOP. */
6198 pe = loop_preheader_edge (loop);
6199 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6200 gcc_assert (!new_bb);
6203 *realignment_token = gimple_call_lhs (new_stmt);
6205 /* The result of the CALL_EXPR to this builtin is determined from
6206 the value of the parameter and no global variables are touched
6207 which makes the builtin a "const" function. Requiring the
6208 builtin to have the "const" attribute makes it unnecessary
6209 to call mark_call_clobbered. */
6210 gcc_assert (TREE_READONLY (builtin_decl));
6213 if (alignment_support_scheme == dr_explicit_realign)
6214 return msq;
6216 gcc_assert (!compute_in_loop);
6217 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
6220 /* 5. Create msq = phi <msq_init, lsq> in loop */
6222 pe = loop_preheader_edge (containing_loop);
6223 vec_dest = vect_create_destination_var (scalar_dest, vectype);
6224 msq = make_ssa_name (vec_dest);
6225 phi_stmt = create_phi_node (msq, containing_loop->header);
6226 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
6228 return msq;
6232 /* Function vect_grouped_load_supported.
6234 COUNT is the size of the load group (the number of statements plus the
6235 number of gaps). SINGLE_ELEMENT_P is true if there is actually
6236 only one statement, with a gap of COUNT - 1.
6238 Returns true if a suitable permute exists. */
6240 bool
6241 vect_grouped_load_supported (tree vectype, bool single_element_p,
6242 unsigned HOST_WIDE_INT count)
6244 machine_mode mode = TYPE_MODE (vectype);
6246 /* If this is single-element interleaving with an element distance
6247      that leaves unused vector loads around, punt - we at least create
6248 very sub-optimal code in that case (and blow up memory,
6249 see PR65518). */
6250 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
6252 if (dump_enabled_p ())
6253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6254 "single-element interleaving not supported "
6255 "for not adjacent vector loads\n");
6256 return false;
6259 /* vect_permute_load_chain requires the group size to be equal to 3 or
6260 be a power of two. */
6261 if (count != 3 && exact_log2 (count) == -1)
6263 if (dump_enabled_p ())
6264 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6265 "the size of the group of accesses"
6266 " is not a power of 2 or not equal to 3\n");
6267 return false;
6270 /* Check that the permutation is supported. */
6271 if (VECTOR_MODE_P (mode))
6273 unsigned int i, j;
6274 if (count == 3)
6276 unsigned int nelt;
6277 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
6279 if (dump_enabled_p ())
6280 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6281 "cannot handle groups of 3 loads for"
6282 " variable-length vectors\n");
6283 return false;
6286 vec_perm_builder sel (nelt, nelt, 1);
6287 sel.quick_grow (nelt);
6288 vec_perm_indices indices;
6289 unsigned int k;
6290 for (k = 0; k < 3; k++)
6292 for (i = 0; i < nelt; i++)
6293 if (3 * i + k < 2 * nelt)
6294 sel[i] = 3 * i + k;
6295 else
6296 sel[i] = 0;
6297 indices.new_vector (sel, 2, nelt);
6298 if (!can_vec_perm_const_p (mode, mode, indices))
6300 if (dump_enabled_p ())
6301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6302 "shuffle of 3 loads is not supported by"
6303 " target\n");
6304 return false;
6306 for (i = 0, j = 0; i < nelt; i++)
6307 if (3 * i + k < 2 * nelt)
6308 sel[i] = i;
6309 else
6310 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6311 indices.new_vector (sel, 2, nelt);
6312 if (!can_vec_perm_const_p (mode, mode, indices))
6314 if (dump_enabled_p ())
6315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6316 "shuffle of 3 loads is not supported by"
6317 " target\n");
6318 return false;
6321 return true;
6323 else
6325 /* If length is not equal to 3 then only power of 2 is supported. */
6326 gcc_assert (pow2p_hwi (count));
6327 poly_uint64 nelt = GET_MODE_NUNITS (mode);
6329 /* The encoding has a single stepped pattern. */
6330 vec_perm_builder sel (nelt, 1, 3);
6331 sel.quick_grow (3);
6332 for (i = 0; i < 3; i++)
6333 sel[i] = i * 2;
6334 vec_perm_indices indices (sel, 2, nelt);
6335 if (can_vec_perm_const_p (mode, mode, indices))
6337 for (i = 0; i < 3; i++)
6338 sel[i] = i * 2 + 1;
6339 indices.new_vector (sel, 2, nelt);
6340 if (can_vec_perm_const_p (mode, mode, indices))
6341 return true;
6346 if (dump_enabled_p ())
6347 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6348 "extract even/odd not supported by target\n");
6349 return false;
6352 /* Return FN if vec_{masked_,mask_len_}load_lanes is available for COUNT vectors
6353 of type VECTYPE. MASKED_P says whether the masked form is needed. */
6355 internal_fn
6356 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6357 bool masked_p)
6359 if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
6360 vec_mask_len_load_lanes_optab, vectype,
6361 count))
6362 return IFN_MASK_LEN_LOAD_LANES;
6363 else if (masked_p)
6365 if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6366 vec_mask_load_lanes_optab, vectype,
6367 count))
6368 return IFN_MASK_LOAD_LANES;
6370 else
6372 if (vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
6373 vectype, count))
6374 return IFN_LOAD_LANES;
6376 return IFN_LAST;
6379 /* Function vect_permute_load_chain.
6381 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6382 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6383 the input data correctly. Return the final references for loads in
6384 RESULT_CHAIN.
6386 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6387 The input is 4 vectors each containing 8 elements. We assign a number to each
6388 element, the input sequence is:
6390 1st vec: 0 1 2 3 4 5 6 7
6391 2nd vec: 8 9 10 11 12 13 14 15
6392 3rd vec: 16 17 18 19 20 21 22 23
6393 4th vec: 24 25 26 27 28 29 30 31
6395 The output sequence should be:
6397 1st vec: 0 4 8 12 16 20 24 28
6398 2nd vec: 1 5 9 13 17 21 25 29
6399 3rd vec: 2 6 10 14 18 22 26 30
6400 4th vec: 3 7 11 15 19 23 27 31
6402 i.e., the first output vector should contain the first elements of each
6403 interleaving group, etc.
6405 We use extract_even/odd instructions to create such output. The input of
6406 each extract_even/odd operation is two vectors
6407 1st vec 2nd vec
6408 0 1 2 3 4 5 6 7
6410 and the output is the vector of extracted even/odd elements. The output of
6411 extract_even will be: 0 2 4 6
6412 and of extract_odd: 1 3 5 7
6415 The permutation is done in log LENGTH stages. In each stage extract_even
6416 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6417 their order. In our example,
6419 E1: extract_even (1st vec, 2nd vec)
6420 E2: extract_odd (1st vec, 2nd vec)
6421 E3: extract_even (3rd vec, 4th vec)
6422 E4: extract_odd (3rd vec, 4th vec)
6424 The output for the first stage will be:
6426 E1: 0 2 4 6 8 10 12 14
6427 E2: 1 3 5 7 9 11 13 15
6428 E3: 16 18 20 22 24 26 28 30
6429 E4: 17 19 21 23 25 27 29 31
6431 In order to proceed and create the correct sequence for the next stage (or
6432 for the correct output, if the second stage is the last one, as in our
6433 example), we first put the output of extract_even operation and then the
6434 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6435 The input for the second stage is:
6437 1st vec (E1): 0 2 4 6 8 10 12 14
6438 2nd vec (E3): 16 18 20 22 24 26 28 30
6439 3rd vec (E2): 1 3 5 7 9 11 13 15
6440 4th vec (E4): 17 19 21 23 25 27 29 31
6442 The output of the second stage:
6444 E1: 0 4 8 12 16 20 24 28
6445 E2: 2 6 10 14 18 22 26 30
6446 E3: 1 5 9 13 17 21 25 29
6447 E4: 3 7 11 15 19 23 27 31
6449 And RESULT_CHAIN after reordering:
6451 1st vec (E1): 0 4 8 12 16 20 24 28
6452 2nd vec (E3): 1 5 9 13 17 21 25 29
6453 3rd vec (E2): 2 6 10 14 18 22 26 30
6454 4th vec (E4): 3 7 11 15 19 23 27 31. */
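/* In GIMPLE terms, one stage of the power-of-two path below emits, per pair
   of input vectors (shown for 8-element vectors, SSA names illustrative):

     vect_perm_even_1 = VEC_PERM_EXPR <first_vect, second_vect,
				       { 0, 2, 4, 6, 8, 10, 12, 14 }>;
     vect_perm_odd_2  = VEC_PERM_EXPR <first_vect, second_vect,
				       { 1, 3, 5, 7, 9, 11, 13, 15 }>;

   the even results going to the first half of RESULT_CHAIN and the odd
   results to the second half.  */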
6456 static void
6457 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6458 unsigned int length,
6459 stmt_vec_info stmt_info,
6460 gimple_stmt_iterator *gsi,
6461 vec<tree> *result_chain)
6463 tree data_ref, first_vect, second_vect;
6464 tree perm_mask_even, perm_mask_odd;
6465 tree perm3_mask_low, perm3_mask_high;
6466 gimple *perm_stmt;
6467 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6468 unsigned int i, j, log_length = exact_log2 (length);
6470 result_chain->quick_grow (length);
6471 memcpy (result_chain->address (), dr_chain.address (),
6472 length * sizeof (tree));
6474 if (length == 3)
6476 /* vect_grouped_load_supported ensures that this is constant. */
6477 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6478 unsigned int k;
6480 vec_perm_builder sel (nelt, nelt, 1);
6481 sel.quick_grow (nelt);
6482 vec_perm_indices indices;
6483 for (k = 0; k < 3; k++)
6485 for (i = 0; i < nelt; i++)
6486 if (3 * i + k < 2 * nelt)
6487 sel[i] = 3 * i + k;
6488 else
6489 sel[i] = 0;
6490 indices.new_vector (sel, 2, nelt);
6491 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6493 for (i = 0, j = 0; i < nelt; i++)
6494 if (3 * i + k < 2 * nelt)
6495 sel[i] = i;
6496 else
6497 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6498 indices.new_vector (sel, 2, nelt);
6499 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6501 first_vect = dr_chain[0];
6502 second_vect = dr_chain[1];
6504 /* Create interleaving stmt (low part of):
6505         low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6506 ...}> */
6507 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6508 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6509 second_vect, perm3_mask_low);
6510 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6512 /* Create interleaving stmt (high part of):
6513         high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6514 ...}> */
6515 first_vect = data_ref;
6516 second_vect = dr_chain[2];
6517 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6518 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6519 second_vect, perm3_mask_high);
6520 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6521 (*result_chain)[k] = data_ref;
6524 else
6526 /* If length is not equal to 3 then only power of 2 is supported. */
6527 gcc_assert (pow2p_hwi (length));
6529 /* The encoding has a single stepped pattern. */
6530 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6531 vec_perm_builder sel (nelt, 1, 3);
6532 sel.quick_grow (3);
6533 for (i = 0; i < 3; ++i)
6534 sel[i] = i * 2;
6535 vec_perm_indices indices (sel, 2, nelt);
6536 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6538 for (i = 0; i < 3; ++i)
6539 sel[i] = i * 2 + 1;
6540 indices.new_vector (sel, 2, nelt);
6541 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6543 for (i = 0; i < log_length; i++)
6545 for (j = 0; j < length; j += 2)
6547 first_vect = dr_chain[j];
6548 second_vect = dr_chain[j+1];
6550 /* data_ref = permute_even (first_data_ref, second_data_ref); */
6551 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6552 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6553 first_vect, second_vect,
6554 perm_mask_even);
6555 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6556 (*result_chain)[j/2] = data_ref;
6558 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6559 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6560 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6561 first_vect, second_vect,
6562 perm_mask_odd);
6563 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6564 (*result_chain)[j/2+length/2] = data_ref;
6566 memcpy (dr_chain.address (), result_chain->address (),
6567 length * sizeof (tree));
6572 /* Function vect_shift_permute_load_chain.
6574 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6575 sequence of stmts to reorder the input data accordingly.
6576 Return the final references for loads in RESULT_CHAIN.
6577    Return true if successful, false otherwise.
6579 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6580 The input is 3 vectors each containing 8 elements. We assign a
6581 number to each element, the input sequence is:
6583 1st vec: 0 1 2 3 4 5 6 7
6584 2nd vec: 8 9 10 11 12 13 14 15
6585 3rd vec: 16 17 18 19 20 21 22 23
6587 The output sequence should be:
6589 1st vec: 0 3 6 9 12 15 18 21
6590 2nd vec: 1 4 7 10 13 16 19 22
6591 3rd vec: 2 5 8 11 14 17 20 23
6593 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6595    First we shuffle all 3 vectors to get the correct element order:
6597 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6598 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6599 3rd vec: (16 19 22) (17 20 23) (18 21)
6601 Next we unite and shift vector 3 times:
6603 1st step:
6604 shift right by 6 the concatenation of:
6605 "1st vec" and "2nd vec"
6606 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6607 "2nd vec" and "3rd vec"
6608 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6609 "3rd vec" and "1st vec"
6610 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6611 | New vectors |
6613 So that now new vectors are:
6615 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6616 2nd vec: (10 13) (16 19 22) (17 20 23)
6617 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6619 2nd step:
6620 shift right by 5 the concatenation of:
6621 "1st vec" and "3rd vec"
6622 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6623 "2nd vec" and "1st vec"
6624 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6625 "3rd vec" and "2nd vec"
6626 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6627 | New vectors |
6629 So that now new vectors are:
6631 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6632 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6633 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6635 3rd step:
6636 shift right by 5 the concatenation of:
6637 "1st vec" and "1st vec"
6638 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6639 shift right by 3 the concatenation of:
6640 "2nd vec" and "2nd vec"
6641 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6642 | New vectors |
6644 So that now all vectors are READY:
6645 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6646 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6647 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6649 This algorithm is faster than one in vect_permute_load_chain if:
6650 1. "shift of a concatination" is faster than general permutation.
6651 This is usually so.
6652 2. The TARGET machine can't execute vector instructions in parallel.
6653       This is because each step of the algorithm depends on the previous one.
6654 The algorithm in vect_permute_load_chain is much more parallel.
6656 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6659 static bool
6660 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6661 unsigned int length,
6662 stmt_vec_info stmt_info,
6663 gimple_stmt_iterator *gsi,
6664 vec<tree> *result_chain)
6666 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6667 tree perm2_mask1, perm2_mask2, perm3_mask;
6668 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6669 gimple *perm_stmt;
6671 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6672 machine_mode vmode = TYPE_MODE (vectype);
6673 unsigned int i;
6674 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6676 unsigned HOST_WIDE_INT nelt, vf;
6677 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6678 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6679 /* Not supported for variable-length vectors. */
6680 return false;
6682 vec_perm_builder sel (nelt, nelt, 1);
6683 sel.quick_grow (nelt);
6685 result_chain->quick_grow (length);
6686 memcpy (result_chain->address (), dr_chain.address (),
6687 length * sizeof (tree));
6689 if (pow2p_hwi (length) && vf > 4)
6691 unsigned int j, log_length = exact_log2 (length);
6692 for (i = 0; i < nelt / 2; ++i)
6693 sel[i] = i * 2;
6694 for (i = 0; i < nelt / 2; ++i)
6695 sel[nelt / 2 + i] = i * 2 + 1;
6696 vec_perm_indices indices (sel, 2, nelt);
6697 if (!can_vec_perm_const_p (vmode, vmode, indices))
6699 if (dump_enabled_p ())
6700 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6701 "shuffle of 2 fields structure is not \
6702 supported by target\n");
6703 return false;
6705 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6707 for (i = 0; i < nelt / 2; ++i)
6708 sel[i] = i * 2 + 1;
6709 for (i = 0; i < nelt / 2; ++i)
6710 sel[nelt / 2 + i] = i * 2;
6711 indices.new_vector (sel, 2, nelt);
6712 if (!can_vec_perm_const_p (vmode, vmode, indices))
6714 if (dump_enabled_p ())
6715 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6716 "shuffle of 2 fields structure is not \
6717 supported by target\n");
6718 return false;
6720 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6722 /* Generating permutation constant to shift all elements.
6723 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6724 for (i = 0; i < nelt; i++)
6725 sel[i] = nelt / 2 + i;
6726 indices.new_vector (sel, 2, nelt);
6727 if (!can_vec_perm_const_p (vmode, vmode, indices))
6729 if (dump_enabled_p ())
6730 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6731 "shift permutation is not supported by target\n");
6732 return false;
6734 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6736 /* Generating permutation constant to select vector from 2.
6737 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6738 for (i = 0; i < nelt / 2; i++)
6739 sel[i] = i;
6740 for (i = nelt / 2; i < nelt; i++)
6741 sel[i] = nelt + i;
6742 indices.new_vector (sel, 2, nelt);
6743 if (!can_vec_perm_const_p (vmode, vmode, indices))
6745 if (dump_enabled_p ())
6746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6747 "select is not supported by target\n");
6748 return false;
6750 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6752 for (i = 0; i < log_length; i++)
6754 for (j = 0; j < length; j += 2)
6756 first_vect = dr_chain[j];
6757 second_vect = dr_chain[j + 1];
6759 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6760 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6761 first_vect, first_vect,
6762 perm2_mask1);
6763 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6764 vect[0] = data_ref;
6766 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6767 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6768 second_vect, second_vect,
6769 perm2_mask2);
6770 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6771 vect[1] = data_ref;
6773 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6774 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6775 vect[0], vect[1], shift1_mask);
6776 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6777 (*result_chain)[j/2 + length/2] = data_ref;
6779 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6780 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6781 vect[0], vect[1], select_mask);
6782 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6783 (*result_chain)[j/2] = data_ref;
6785 memcpy (dr_chain.address (), result_chain->address (),
6786 length * sizeof (tree));
6788 return true;
6790 if (length == 3 && vf > 2)
6792 unsigned int k = 0, l = 0;
6794       /* Generating permutation constant to get all elements in right order.
6795 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6796 for (i = 0; i < nelt; i++)
6798 if (3 * k + (l % 3) >= nelt)
6800 k = 0;
6801 l += (3 - (nelt % 3));
6803 sel[i] = 3 * k + (l % 3);
6804 k++;
6806 vec_perm_indices indices (sel, 2, nelt);
6807 if (!can_vec_perm_const_p (vmode, vmode, indices))
6809 if (dump_enabled_p ())
6810 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6811 "shuffle of 3 fields structure is not \
6812 supported by target\n");
6813 return false;
6815 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6817 /* Generating permutation constant to shift all elements.
6818 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6819 for (i = 0; i < nelt; i++)
6820 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6821 indices.new_vector (sel, 2, nelt);
6822 if (!can_vec_perm_const_p (vmode, vmode, indices))
6824 if (dump_enabled_p ())
6825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6826 "shift permutation is not supported by target\n");
6827 return false;
6829 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6831 /* Generating permutation constant to shift all elements.
6832 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6833 for (i = 0; i < nelt; i++)
6834 sel[i] = 2 * (nelt / 3) + 1 + i;
6835 indices.new_vector (sel, 2, nelt);
6836 if (!can_vec_perm_const_p (vmode, vmode, indices))
6838 if (dump_enabled_p ())
6839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6840 "shift permutation is not supported by target\n");
6841 return false;
6843 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6845 /* Generating permutation constant to shift all elements.
6846 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6847 for (i = 0; i < nelt; i++)
6848 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6849 indices.new_vector (sel, 2, nelt);
6850 if (!can_vec_perm_const_p (vmode, vmode, indices))
6852 if (dump_enabled_p ())
6853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6854 "shift permutation is not supported by target\n");
6855 return false;
6857 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6859 /* Generating permutation constant to shift all elements.
6860 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6861 for (i = 0; i < nelt; i++)
6862 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6863 indices.new_vector (sel, 2, nelt);
6864 if (!can_vec_perm_const_p (vmode, vmode, indices))
6866 if (dump_enabled_p ())
6867 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6868 "shift permutation is not supported by target\n");
6869 return false;
6871 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6873 for (k = 0; k < 3; k++)
6875 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6876 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6877 dr_chain[k], dr_chain[k],
6878 perm3_mask);
6879 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6880 vect[k] = data_ref;
6883 for (k = 0; k < 3; k++)
6885 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6886 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6887 vect[k % 3], vect[(k + 1) % 3],
6888 shift1_mask);
6889 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6890 vect_shift[k] = data_ref;
6893 for (k = 0; k < 3; k++)
6895 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6896 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6897 vect_shift[(4 - k) % 3],
6898 vect_shift[(3 - k) % 3],
6899 shift2_mask);
6900 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6901 vect[k] = data_ref;
6904 (*result_chain)[3 - (nelt % 3)] = vect[2];
6906 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6907 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6908 vect[0], shift3_mask);
6909 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6910 (*result_chain)[nelt % 3] = data_ref;
6912 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6913 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6914 vect[1], shift4_mask);
6915 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6916 (*result_chain)[0] = data_ref;
6917 return true;
6919 return false;
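/* Worked example for the shift selectors built above (illustrative only;
   the helper below is hypothetical and not used by the vectorizer).  For
   NELT == 8 it reproduces the values quoted in the comments:
   shift1 = {6 7 8 9 10 11 12 13}, shift2 = {5 .. 12},
   shift3 = {3 .. 10} and shift4 = {5 .. 12}.  */

static void
example_shift_selectors_for_nelt_8 (unsigned (&shift1)[8],
				    unsigned (&shift2)[8],
				    unsigned (&shift3)[8],
				    unsigned (&shift4)[8])
{
  const unsigned nelt = 8;
  for (unsigned i = 0; i < nelt; i++)
    {
      shift1[i] = 2 * (nelt / 3) + (nelt % 3) + i;	/* 6 + i  */
      shift2[i] = 2 * (nelt / 3) + 1 + i;		/* 5 + i  */
      shift3[i] = (nelt / 3) + (nelt % 3) / 2 + i;	/* 3 + i  */
      shift4[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;	/* 5 + i  */
    }
}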
6922 /* Function vect_transform_grouped_load.
6924 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6925 to perform their permutation and assign the resulting vectorized statements
6926 to the scalar statements.
6929 void
6930 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6931 vec<tree> dr_chain,
6932 int size, gimple_stmt_iterator *gsi)
6934 machine_mode mode;
6935 vec<tree> result_chain = vNULL;
6937 /* DR_CHAIN contains the input data-refs that are part of the interleaving.
6938 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
6939 permuted vectors, which are ready for vector computation. */
6940 result_chain.create (size);
6942 /* If the reassociation width for the vector type is 2 or greater, the target
6943 machine can execute 2 or more vector instructions in parallel; in that case
6944 use vect_permute_load_chain, otherwise try vect_shift_permute_load_chain. */
6945 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6946 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6947 || pow2p_hwi (size)
6948 || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6949 gsi, &result_chain))
6950 vect_permute_load_chain (vinfo, dr_chain,
6951 size, stmt_info, gsi, &result_chain);
6952 vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6953 result_chain.release ();
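/* A minimal sketch of the selection heuristic above (the helper name is
   hypothetical and exists only for illustration): the generic permute
   chain is preferred whenever the target can issue more than one
   VEC_PERM_EXPR in parallel or SIZE is a power of two; only otherwise is
   the shift-based chain attempted, with the generic chain as fallback.  */

static bool
example_prefer_generic_permute_chain (int reassoc_width, int size)
{
  return reassoc_width > 1 || pow2p_hwi (size);
}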
6956 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6957 generated as part of the vectorization of STMT_INFO. Assign the statement
6958 for each vector to the associated scalar statement. */
6960 void
6961 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6962 vec<tree> result_chain)
6964 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6965 unsigned int i, gap_count;
6966 tree tmp_data_ref;
6968 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6969 Since we scan the chain starting from its first node, the order of the
6970 statements corresponds to the order of data-refs in RESULT_CHAIN. */
6971 stmt_vec_info next_stmt_info = first_stmt_info;
6972 gap_count = 1;
6973 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6975 if (!next_stmt_info)
6976 break;
6978 /* Skip the gaps. Loads created for the gaps will be removed later by the
6979 dead code elimination pass. No need to check for the first stmt in
6980 the group, since it always exists.
6981 DR_GROUP_GAP is the number of steps in elements from the previous
6982 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6983 correspond to the gaps. */
6984 if (next_stmt_info != first_stmt_info
6985 && gap_count < DR_GROUP_GAP (next_stmt_info))
6987 gap_count++;
6988 continue;
6991 /* ??? The following needs cleanup after the removal of
6992 DR_GROUP_SAME_DR_STMT. */
6993 if (next_stmt_info)
6995 gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6996 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6997 copies, and we put the new vector statement last. */
6998 STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
7000 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
7001 gap_count = 1;
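/* Worked example of the gap handling above (the scalar accesses are
   hypothetical): for a group that loads a[3*i] and a[3*i+2], the element
   a[3*i+1] is a gap, so DR_GROUP_GAP of the second statement is 2.
   RESULT_CHAIN still holds a vector for every position: the loop assigns
   result_chain[0] to the first statement, skips result_chain[1] because
   gap_count (1) < DR_GROUP_GAP (2), and then assigns result_chain[2] to
   the second statement.  */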
7006 /* Function vect_can_force_dr_alignment_p.
7008 Returns whether the alignment of a DECL can be forced to be aligned
7009 on an ALIGNMENT-bit boundary. */
7011 bool
7012 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
7014 if (!VAR_P (decl))
7015 return false;
7017 if (decl_in_symtab_p (decl)
7018 && !symtab_node::get (decl)->can_increase_alignment_p ())
7019 return false;
7021 if (TREE_STATIC (decl))
7022 return (known_le (alignment,
7023 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
7024 else
7025 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
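/* Minimal usage sketch (the caller below is hypothetical): a pass that
   wants to peel or version for alignment would first check that the base
   declaration of the access may be over-aligned to the vector type's
   alignment.  TYPE_ALIGN is in bits, matching the ALIGNMENT argument.  */

static bool
example_can_overalign_base (tree base_decl, tree vectype)
{
  return vect_can_force_dr_alignment_p (base_decl, TYPE_ALIGN (vectype));
}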
7028 /* Return whether the data reference DR_INFO is supported with respect to its
7029 alignment when accessed using vector type VECTYPE.
7030 MISALIGNMENT is the known misalignment of the access: 0 if the access is
7031 known to be aligned, DR_MISALIGNMENT_UNKNOWN if the misalignment is not
7032 known at compile time. */
7034 enum dr_alignment_support
7035 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
7036 tree vectype, int misalignment)
7038 data_reference *dr = dr_info->dr;
7039 stmt_vec_info stmt_info = dr_info->stmt;
7040 machine_mode mode = TYPE_MODE (vectype);
7041 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7042 class loop *vect_loop = NULL;
7043 bool nested_in_vect_loop = false;
7045 if (misalignment == 0)
7046 return dr_aligned;
7048 /* For now assume all conditional loads/stores support unaligned
7049 access without any special code. */
7050 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
7051 if (gimple_call_internal_p (stmt)
7052 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
7053 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
7054 return dr_unaligned_supported;
7056 if (loop_vinfo)
7058 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
7059 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
7062 /* Possibly unaligned access. */
7064 /* We can choose between using the implicit realignment scheme (generating
7065 a misaligned_move stmt) and the explicit realignment scheme (generating
7066 aligned loads with a REALIGN_LOAD). There are two variants of the
7067 explicit realignment scheme: optimized and unoptimized.
7068 We can optimize the realignment only if the step between consecutive
7069 vector loads is equal to the vector size. Since the vector memory
7070 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
7071 is guaranteed that the misalignment amount remains the same throughout the
7072 execution of the vectorized loop. Therefore, we can create the
7073 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
7074 at the loop preheader.
7076 However, in the case of outer-loop vectorization, when vectorizing a
7077 memory access in the inner-loop nested within the LOOP that is now being
7078 vectorized, while it is guaranteed that the misalignment of the
7079 vectorized memory access will remain the same in different outer-loop
7080 iterations, it is *not* guaranteed that it will remain the same throughout
7081 the execution of the inner-loop. This is because the inner-loop advances
7082 with the original scalar step (and not in steps of VS). If the inner-loop
7083 step happens to be a multiple of VS, then the misalignment remains fixed
7084 and we can use the optimized realignment scheme. For example:
7086 for (i=0; i<N; i++)
7087 for (j=0; j<M; j++)
7088 s += a[i+j];
7090 When vectorizing the i-loop in the above example, the step between
7091 consecutive vector loads is 1, and so the misalignment does not remain
7092 fixed across the execution of the inner-loop, and the realignment cannot
7093 be optimized (as illustrated in the following pseudo vectorized loop):
7095 for (i=0; i<N; i+=4)
7096 for (j=0; j<M; j++){
7097 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
7098 // when j is {0,1,2,3,4,5,6,7,...} respectively.
7099 // (assuming that we start from an aligned address).
7102 We therefore have to use the unoptimized realignment scheme:
7104 for (i=0; i<N; i+=4)
7105 for (j=k; j<M; j+=4)
7106 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
7107 // that the misalignment of the initial address is
7108 // 0).
7110 The loop can then be vectorized as follows:
7112 for (k=0; k<4; k++){
7113 rt = get_realignment_token (&vp[k]);
7114 for (i=0; i<N; i+=4){
7115 v1 = vp[i+k];
7116 for (j=k; j<M; j+=4){
7117 v2 = vp[i+j+VS-1];
7118 va = REALIGN_LOAD <v1,v2,rt>;
7119 vs += va;
7120 v1 = v2;
7123 } */
7125 if (DR_IS_READ (dr))
7127 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
7128 && (!targetm.vectorize.builtin_mask_for_load
7129 || targetm.vectorize.builtin_mask_for_load ()))
7131 /* If we are doing SLP then the accesses need not have the
7132 same alignment; instead it depends on the SLP group size. */
7133 if (loop_vinfo
7134 && STMT_SLP_TYPE (stmt_info)
7135 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
7136 || !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7137 * (DR_GROUP_SIZE
7138 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
7139 TYPE_VECTOR_SUBPARTS (vectype))))
7141 else if (!loop_vinfo
7142 || (nested_in_vect_loop
7143 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
7144 GET_MODE_SIZE (TYPE_MODE (vectype)))))
7145 return dr_explicit_realign;
7146 else
7147 return dr_explicit_realign_optimized;
7151 bool is_packed = false;
7152 tree type = TREE_TYPE (DR_REF (dr));
7153 if (misalignment == DR_MISALIGNMENT_UNKNOWN)
7154 is_packed = not_size_aligned (DR_REF (dr));
7155 if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
7156 is_packed))
7157 return dr_unaligned_supported;
7159 /* Unsupported. */
7160 return dr_unaligned_unsupported;
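/* Usage sketch (hypothetical helper, for illustration only): most callers
   of vect_supportable_dr_alignment only need to know whether the access
   can be vectorized at all with its current (mis)alignment.  */

static bool
example_dr_alignment_is_vectorizable (vec_info *vinfo, dr_vec_info *dr_info,
				      tree vectype, int misalignment)
{
  return (vect_supportable_dr_alignment (vinfo, dr_info, vectype,
					 misalignment)
	  != dr_unaligned_unsupported);
}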