gcc/tree-vect-data-refs.cc

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2024 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "tree-cfg.h"
  53 #include "tree-hash-traits.h"
  54 #include "vec-perm-indices.h"
  55 #include "internal-fn.h"
  56 #include "gimple-fold.h"
  57
  58 /* Return true if load- or store-lanes optab OPTAB is implemented for
  59    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  60
  61 static bool
  62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  63                               tree vectype, unsigned HOST_WIDE_INT count)
  64 {
  65   machine_mode mode, array_mode;
  66   bool limit_p;
  67
  68   mode = TYPE_MODE (vectype);
  69   if (!targetm.array_mode (mode, count).exists (&array_mode))
  70     {
  71       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
  72       limit_p = !targetm.array_mode_supported_p (mode, count);
  73       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
  74         {
  75           if (dump_enabled_p ())
  76             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  77                              "no array mode for %s[%wu]\n",
  78                              GET_MODE_NAME (mode), count);
  79           return false;
  80         }
  81     }
  82
  83   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  84     {
  85       if (dump_enabled_p ())
  86         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  87                          "cannot use %s<%s><%s>\n", name,
  88                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  89       return false;
  90     }
  91
  92   if (dump_enabled_p ())
  93     dump_printf_loc (MSG_NOTE, vect_location,
  94                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  95                      GET_MODE_NAME (mode));
  96
  97   return true;
  98 }
  99
 100 /* Helper function to identify a simd clone call.  If this is a call to a
 101    function with simd clones then return the corresponding cgraph_node,
 102    otherwise return NULL.  */
 103
 104 static cgraph_node*
 105 simd_clone_call_p (gimple *stmt)
 106 {
 107   gcall *call = dyn_cast <gcall *> (stmt);
 108   if (!call)
 109     return NULL;
 110
 111   tree fndecl = NULL_TREE;
 112   if (gimple_call_internal_p (call, IFN_MASK_CALL))
 113     fndecl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
 114   else
 115     fndecl = gimple_call_fndecl (stmt);
 116
 117   if (fndecl == NULL_TREE)
 118     return NULL;
 119
 120   cgraph_node *node = cgraph_node::get (fndecl);
 121   if (node && node->simd_clones != NULL)
 122     return node;
 123
 124   return NULL;
 125 }
 126
 127
 128
 129 /* Return the smallest scalar part of STMT_INFO.
 130    This is used to determine the vectype of the stmt.  We generally set the
 131    vectype according to the type of the result (lhs).  For stmts whose
 132    result-type is different than the type of the arguments (e.g., demotion,
 133    promotion), vectype will be reset appropriately (later).  Note that we have
 134    to visit the smallest datatype in this function, because that determines the
 135    VF.  If the smallest datatype in the loop is present only as the rhs of a
 136    promotion operation - we'd miss it.
 137    Such a case, where a variable of this datatype does not appear in the lhs
 138    anywhere in the loop, can only occur if it's an invariant: e.g.:
 139    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 140    invariant motion.  However, we cannot rely on invariant motion to always
 141    take invariants out of the loop, and so in the case of promotion we also
 142    have to check the rhs.
 143    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 144    types.  */
 145
 146 tree
 147 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
 148 {
 149   HOST_WIDE_INT lhs, rhs;
 150
 151   /* During the analysis phase, this function is called on arbitrary
 152      statements that might not have scalar results.  */
 153   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
 154     return scalar_type;
 155
 156   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 157
 158   gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
 159   if (assign)
 160     {
 161       scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
 162       if (gimple_assign_cast_p (assign)
 163           || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
 164           || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
 165           || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
 166           || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
 167           || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
 168         {
 169           tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
 170
 171           rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 172           if (rhs < lhs)
 173             scalar_type = rhs_type;
 174         }
 175     }
 176   else if (cgraph_node *node = simd_clone_call_p (stmt_info->stmt))
 177     {
 178       auto clone = node->simd_clones->simdclone;
 179       for (unsigned int i = 0; i < clone->nargs; ++i)
 180         {
 181           if (clone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
 182             {
 183               tree arg_scalar_type = TREE_TYPE (clone->args[i].vector_type);
 184               rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (arg_scalar_type));
 185               if (rhs < lhs)
 186                 {
 187                   scalar_type = arg_scalar_type;
 188                   lhs = rhs;
 189                 }
 190             }
 191         }
 192     }
 193   else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
 194     {
 195       unsigned int i = 0;
 196       if (gimple_call_internal_p (call))
 197         {
 198           internal_fn ifn = gimple_call_internal_fn (call);
 199           if (internal_load_fn_p (ifn))
 200             /* For loads the LHS type does the trick.  */
 201             i = ~0U;
 202           else if (internal_store_fn_p (ifn))
 203             {
 204               /* For stores use the tyep of the stored value.  */
 205               i = internal_fn_stored_value_index (ifn);
 206               scalar_type = TREE_TYPE (gimple_call_arg (call, i));
 207               i = ~0U;
 208             }
 209           else if (internal_fn_mask_index (ifn) == 0)
 210             i = 1;
 211         }
 212       if (i < gimple_call_num_args (call))
 213         {
 214           tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
 215           if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
 216             {
 217               rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 218               if (rhs < lhs)
 219                 scalar_type = rhs_type;
 220             }
 221         }
 222     }
 223
 224   return scalar_type;
 225 }
 226
 227
 228 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 229    tested at run-time.  Return TRUE if DDR was successfully inserted.
 230    Return false if versioning is not supported.  */
 231
 232 static opt_result
 233 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 234 {
 235   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 236
 237   if ((unsigned) param_vect_max_version_for_alias_checks == 0)
 238     return opt_result::failure_at (vect_location,
 239                                    "will not create alias checks, as"
 240                                    " --param vect-max-version-for-alias-checks"
 241                                    " == 0\n");
 242
 243   opt_result res
 244     = runtime_alias_check_p (ddr, loop,
 245                              optimize_loop_nest_for_speed_p (loop));
 246   if (!res)
 247     return res;
 248
 249   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 250   return opt_result::success ();
 251 }
 252
 253 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
 254
 255 static void
 256 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
 257 {
 258   const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
 259   for (unsigned int i = 0; i < checks.length(); ++i)
 260     if (checks[i] == value)
 261       return;
 262
 263   if (dump_enabled_p ())
 264     dump_printf_loc (MSG_NOTE, vect_location,
 265                      "need run-time check that %T is nonzero\n",
 266                      value);
 267   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
 268 }
 269
 270 /* Return true if we know that the order of vectorized DR_INFO_A and
 271    vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
 272    DR_INFO_B.  At least one of the accesses is a write.  */
 273
 274 static bool
 275 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
 276 {
 277   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
 278   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
 279
 280   /* Single statements are always kept in their original order.  */
 281   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 282       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 283     return true;
 284
 285   /* STMT_A and STMT_B belong to overlapping groups.  All loads are
 286      emitted at the position of the first scalar load.
 287      Stores in a group are emitted at the position of the last scalar store.
 288      Compute that position and check whether the resulting order matches
 289      the current one.  */
 290   stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
 291   if (il_a)
 292     {
 293       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
 294         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
 295              s = DR_GROUP_NEXT_ELEMENT (s))
 296           il_a = get_later_stmt (il_a, s);
 297       else /* DR_IS_READ */
 298         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
 299              s = DR_GROUP_NEXT_ELEMENT (s))
 300           if (get_later_stmt (il_a, s) == il_a)
 301             il_a = s;
 302     }
 303   else
 304     il_a = stmtinfo_a;
 305   stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
 306   if (il_b)
 307     {
 308       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
 309         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
 310              s = DR_GROUP_NEXT_ELEMENT (s))
 311           il_b = get_later_stmt (il_b, s);
 312       else /* DR_IS_READ */
 313         for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
 314              s = DR_GROUP_NEXT_ELEMENT (s))
 315           if (get_later_stmt (il_b, s) == il_b)
 316             il_b = s;
 317     }
 318   else
 319     il_b = stmtinfo_b;
 320   bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
 321   return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
 322 }
 323
 324 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 325    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 326    distances.  These distances are conservatively correct but they don't
 327    reflect a guaranteed dependence.
 328
 329    Return true if this function does all the work necessary to avoid
 330    an alias or false if the caller should use the dependence distances
 331    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 332    the depth of the loop described by LOOP_VINFO and the other arguments
 333    are as for vect_analyze_data_ref_dependence.  */
 334
 335 static bool
 336 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 337                                        loop_vec_info loop_vinfo,
 338                                        int loop_depth, unsigned int *max_vf)
 339 {
 340   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 341   for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
 342     {
 343       int dist = dist_v[loop_depth];
 344       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 345         {
 346           /* If the user asserted safelen >= DIST consecutive iterations
 347              can be executed concurrently, assume independence.
 348
 349              ??? An alternative would be to add the alias check even
 350              in this case, and vectorize the fallback loop with the
 351              maximum VF set to safelen.  However, if the user has
 352              explicitly given a length, it's less likely that that
 353              would be a win.  */
 354           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 355             {
 356               if ((unsigned int) loop->safelen < *max_vf)
 357                 *max_vf = loop->safelen;
 358               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 359               continue;
 360             }
 361
 362           /* For dependence distances of 2 or more, we have the option
 363              of limiting VF or checking for an alias at runtime.
 364              Prefer to check at runtime if we can, to avoid limiting
 365              the VF unnecessarily when the bases are in fact independent.
 366
 367              Note that the alias checks will be removed if the VF ends up
 368              being small enough.  */
 369           dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
 370           dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
 371           return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
 372                   && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
 373                   && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
 374         }
 375     }
 376   return true;
 377 }
 378
 379
 380 /* Function vect_analyze_data_ref_dependence.
 381
 382    FIXME: I needed to change the sense of the returned flag.
 383
 384    Return FALSE if there (might) exist a dependence between a memory-reference
 385    DRA and a memory-reference DRB.  When versioning for alias may check a
 386    dependence at run-time, return TRUE.  Adjust *MAX_VF according to
 387    the data dependence.  */
 388
 389 static opt_result
 390 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 391                                   loop_vec_info loop_vinfo,
 392                                   unsigned int *max_vf)
 393 {
 394   unsigned int i;
 395   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 396   struct data_reference *dra = DDR_A (ddr);
 397   struct data_reference *drb = DDR_B (ddr);
 398   dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
 399   dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
 400   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
 401   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
 402   lambda_vector dist_v;
 403   unsigned int loop_depth;
 404
 405   /* If user asserted safelen consecutive iterations can be
 406      executed concurrently, assume independence.  */
 407   auto apply_safelen = [&]()
 408     {
 409       if (loop->safelen >= 2)
 410         {
 411           if ((unsigned int) loop->safelen < *max_vf)
 412             *max_vf = loop->safelen;
 413           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 414           return true;
 415         }
 416       return false;
 417     };
 418
 419   /* In loop analysis all data references should be vectorizable.  */
 420   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 421       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 422     gcc_unreachable ();
 423
 424   /* Independent data accesses.  */
 425   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 426     return opt_result::success ();
 427
 428   if (dra == drb
 429       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 430     return opt_result::success ();
 431
 432   /* We do not have to consider dependences between accesses that belong
 433      to the same group, unless the stride could be smaller than the
 434      group size.  */
 435   if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 436       && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 437           == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
 438       && !STMT_VINFO_STRIDED_P (stmtinfo_a))
 439     return opt_result::success ();
 440
 441   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 442      least two scalar iterations, there is always also a true dependence.
 443      As the vectorizer does not re-order loads and stores we can ignore
 444      the anti-dependence if TBAA can disambiguate both DRs similar to the
 445      case with known negative distance anti-dependences (positive
 446      distance anti-dependences would violate TBAA constraints).  */
 447   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 448        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 449       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 450                                  get_alias_set (DR_REF (drb))))
 451     return opt_result::success ();
 452
 453   if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 454       || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 455     {
 456       if (apply_safelen ())
 457         return opt_result::success ();
 458
 459       return opt_result::failure_at
 460         (stmtinfo_a->stmt,
 461          "possible alias involving gather/scatter between %T and %T\n",
 462          DR_REF (dra), DR_REF (drb));
 463     }
 464
 465   /* Unknown data dependence.  */
 466   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 467     {
 468       if (apply_safelen ())
 469         return opt_result::success ();
 470
 471       if (dump_enabled_p ())
 472         dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
 473                          "versioning for alias required: "
 474                          "can't determine dependence between %T and %T\n",
 475                          DR_REF (dra), DR_REF (drb));
 476
 477       /* Add to list of ddrs that need to be tested at run-time.  */
 478       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 479     }
 480
 481   /* Known data dependence.  */
 482   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 483     {
 484       if (apply_safelen ())
 485         return opt_result::success ();
 486
 487       if (dump_enabled_p ())
 488         dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
 489                          "versioning for alias required: "
 490                          "bad dist vector for %T and %T\n",
 491                          DR_REF (dra), DR_REF (drb));
 492       /* Add to list of ddrs that need to be tested at run-time.  */
 493       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 494     }
 495
 496   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 497
 498   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
 499       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
 500                                                 loop_depth, max_vf))
 501     return opt_result::success ();
 502
 503   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 504     {
 505       int dist = dist_v[loop_depth];
 506
 507       if (dump_enabled_p ())
 508         dump_printf_loc (MSG_NOTE, vect_location,
 509                          "dependence distance  = %d.\n", dist);
 510
 511       if (dist == 0)
 512         {
 513           if (dump_enabled_p ())
 514             dump_printf_loc (MSG_NOTE, vect_location,
 515                              "dependence distance == 0 between %T and %T\n",
 516                              DR_REF (dra), DR_REF (drb));
 517
 518           /* When we perform grouped accesses and perform implicit CSE
 519              by detecting equal accesses and doing disambiguation with
 520              runtime alias tests like for
 521                 .. = a[i];
 522                 .. = a[i+1];
 523                 a[i] = ..;
 524                 a[i+1] = ..;
 525                 *p = ..;
 526                 .. = a[i];
 527                 .. = a[i+1];
 528              where we will end up loading { a[i], a[i+1] } once, make
 529              sure that inserting group loads before the first load and
 530              stores after the last store will do the right thing.
 531              Similar for groups like
 532                 a[i] = ...;
 533                 ... = a[i];
 534                 a[i+1] = ...;
 535              where loads from the group interleave with the store.  */
 536           if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
 537             return opt_result::failure_at (stmtinfo_a->stmt,
 538                                            "READ_WRITE dependence"
 539                                            " in interleaving.\n");
 540
 541           if (loop->safelen < 2)
 542             {
 543               tree indicator = dr_zero_step_indicator (dra);
 544               if (!indicator || integer_zerop (indicator))
 545                 return opt_result::failure_at (stmtinfo_a->stmt,
 546                                                "access also has a zero step\n");
 547               else if (TREE_CODE (indicator) != INTEGER_CST)
 548                 vect_check_nonzero_value (loop_vinfo, indicator);
 549             }
 550           continue;
 551         }
 552
 553       if (dist > 0 && DDR_REVERSED_P (ddr))
 554         {
 555           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 556              reversed (to make distance vector positive), and the actual
 557              distance is negative.  */
 558           if (dump_enabled_p ())
 559             dump_printf_loc (MSG_NOTE, vect_location,
 560                              "dependence distance negative.\n");
 561           /* When doing outer loop vectorization, we need to check if there is
 562              a backward dependence at the inner loop level if the dependence
 563              at the outer loop is reversed.  See PR81740.  */
 564           if (nested_in_vect_loop_p (loop, stmtinfo_a)
 565               || nested_in_vect_loop_p (loop, stmtinfo_b))
 566             {
 567               unsigned inner_depth = index_in_loop_nest (loop->inner->num,
 568                                                          DDR_LOOP_NEST (ddr));
 569               if (dist_v[inner_depth] < 0)
 570                 return opt_result::failure_at (stmtinfo_a->stmt,
 571                                                "not vectorized, dependence "
 572                                                "between data-refs %T and %T\n",
 573                                                DR_REF (dra), DR_REF (drb));
 574             }
 575           /* Record a negative dependence distance to later limit the
 576              amount of stmt copying / unrolling we can perform.
 577              Only need to handle read-after-write dependence.  */
 578           if (DR_IS_READ (drb)
 579               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 580                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 581             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 582           continue;
 583         }
 584
 585       unsigned int abs_dist = abs (dist);
 586       if (abs_dist >= 2 && abs_dist < *max_vf)
 587         {
 588           /* The dependence distance requires reduction of the maximal
 589              vectorization factor.  */
 590           *max_vf = abs_dist;
 591           if (dump_enabled_p ())
 592             dump_printf_loc (MSG_NOTE, vect_location,
 593                              "adjusting maximal vectorization factor to %i\n",
 594                              *max_vf);
 595         }
 596
 597       if (abs_dist >= *max_vf)
 598         {
 599           /* Dependence distance does not create dependence, as far as
 600              vectorization is concerned, in this case.  */
 601           if (dump_enabled_p ())
 602             dump_printf_loc (MSG_NOTE, vect_location,
 603                              "dependence distance >= VF.\n");
 604           continue;
 605         }
 606
 607       return opt_result::failure_at (stmtinfo_a->stmt,
 608                                      "not vectorized, possible dependence "
 609                                      "between data-refs %T and %T\n",
 610                                      DR_REF (dra), DR_REF (drb));
 611     }
 612
 613   return opt_result::success ();
 614 }
 615
 616 /* Function vect_analyze_early_break_dependences.
 617
 618    Examine all the data references in the loop and make sure that if we have
 619    multiple exits that we are able to safely move stores such that they become
 620    safe for vectorization.  The function also calculates the place where to move
 621    the instructions to and computes what the new vUSE chain should be.
 622
 623    This works in tandem with the CFG that will be produced by
 624    slpeel_tree_duplicate_loop_to_edge_cfg later on.
 625
 626    This function tries to validate whether an early break vectorization
 627    is possible for the current instruction sequence.  Returns True if
 628    possible, otherwise False.
 629
 630    Requirements:
 631      - Any memory access must be to a fixed size buffer.
 632      - There must not be any loads and stores to the same object.
 633      - Multiple loads are allowed as long as they don't alias.
 634
 635    NOTE:
 636      This implementation is very conservative.  Any overlapping loads/stores
 637      that take place before the early break statement get rejected aside from
 638      WAR dependencies.
 639
 640      i.e.:
 641
 642         a[i] = 8
 643         c = a[i]
 644         if (b[i])
 645           ...
 646
 647         is not allowed, but
 648
 649         c = a[i]
 650         a[i] = 8
 651         if (b[i])
 652           ...
 653
 654         is, which is the common case.  */
 655
 656 static opt_result
 657 vect_analyze_early_break_dependences (loop_vec_info loop_vinfo)
 658 {
 659   DUMP_VECT_SCOPE ("vect_analyze_early_break_dependences");
 660
 661   /* List of all load data references found during traversal.  */
 662   auto_vec<data_reference *> bases;
 663   basic_block dest_bb = NULL;
 664
 665   hash_set <gimple *> visited;
 666   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 667   class loop *loop_nest = loop_outer (loop);
 668
 669   if (dump_enabled_p ())
 670     dump_printf_loc (MSG_NOTE, vect_location,
 671                      "loop contains multiple exits, analyzing"
 672                      " statement dependencies.\n");
 673
 674   for (gimple *c : LOOP_VINFO_LOOP_CONDS (loop_vinfo))
 675     {
 676       stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (c);
 677       if (STMT_VINFO_TYPE (loop_cond_info) != loop_exit_ctrl_vec_info_type)
 678         continue;
 679
 680       gimple_stmt_iterator gsi = gsi_for_stmt (c);
 681
 682       /* Now analyze all the remaining statements and try to determine which
 683          instructions are allowed/needed to be moved.  */
 684       while (!gsi_end_p (gsi))
 685         {
 686           gimple *stmt = gsi_stmt (gsi);
 687           gsi_prev (&gsi);
 688           if (!gimple_has_ops (stmt)
 689               || is_gimple_debug (stmt))
 690             continue;
 691
 692           stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (stmt);
 693           auto dr_ref = STMT_VINFO_DATA_REF (stmt_vinfo);
 694           if (!dr_ref)
 695             continue;
 696
 697           /* We currently only support statically allocated objects due to
 698              not having first-faulting loads support or peeling for
 699              alignment support.  Compute the size of the referenced object
 700              (it could be dynamically allocated).  */
 701           tree obj = DR_BASE_ADDRESS (dr_ref);
 702           if (!obj || TREE_CODE (obj) != ADDR_EXPR)
 703             {
 704               if (dump_enabled_p ())
 705                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 706                                  "early breaks only supported on statically"
 707                                  " allocated objects.\n");
 708               return opt_result::failure_at (c,
 709                                  "can't safely apply code motion to "
 710                                  "dependencies of %G to vectorize "
 711                                  "the early exit.\n", c);
 712             }
 713
 714           tree refop = TREE_OPERAND (obj, 0);
 715           tree refbase = get_base_address (refop);
 716           if (!refbase || !DECL_P (refbase) || !DECL_SIZE (refbase)
 717               || TREE_CODE (DECL_SIZE (refbase)) != INTEGER_CST)
 718             {
 719               if (dump_enabled_p ())
 720                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 721                                  "early breaks only supported on"
 722                                  " statically allocated objects.\n");
 723               return opt_result::failure_at (c,
 724                                  "can't safely apply code motion to "
 725                                  "dependencies of %G to vectorize "
 726                                  "the early exit.\n", c);
 727             }
 728
 729           /* Check if vector accesses to the object will be within bounds.
 730              must be a constant or assume loop will be versioned or niters
 731              bounded by VF so accesses are within range.  */
 732           if (!ref_within_array_bound (stmt, DR_REF (dr_ref)))
 733             {
 734               if (dump_enabled_p ())
 735                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 736                                  "early breaks not supported: vectorization "
 737                                  "would %s beyond size of obj.",
 738                                  DR_IS_READ (dr_ref) ? "read" : "write");
 739               return opt_result::failure_at (c,
 740                                  "can't safely apply code motion to "
 741                                  "dependencies of %G to vectorize "
 742                                  "the early exit.\n", c);
 743             }
 744
 745           if (DR_IS_READ (dr_ref))
 746             bases.safe_push (dr_ref);
 747           else if (DR_IS_WRITE (dr_ref))
 748             {
 749               /* We are moving writes down in the CFG.  To be sure that this
 750                  is valid after vectorization we have to check all the loads
 751                  we are sinking the stores past to see if any of them may
 752                  alias or are the same object.
 753
 754                  Same objects will not be an issue because unless the store
 755                  is marked volatile the value can be forwarded.  If the
 756                  store is marked volatile we don't vectorize the loop
 757                  anyway.
 758
 759                  That leaves the check for aliasing.  We don't really need
 760                  to care about the stores aliasing with each other since the
 761                  stores are moved in order so the effects are still observed
 762                  correctly.  This leaves the check for WAR dependencies
 763                  which we would be introducing here if the DR can alias.
 764                  The check is quadratic in loads/stores but I have not found
 765                  a better API to do this.  I believe all loads and stores
 766                  must be checked.  We also must check them when we
 767                  encountered the store, since we don't care about loads past
 768                  the store.  */
 769
 770               for (auto dr_read : bases)
 771                 if (dr_may_alias_p (dr_ref, dr_read, loop_nest))
 772                   {
 773                     if (dump_enabled_p ())
 774                       dump_printf_loc (MSG_MISSED_OPTIMIZATION,
 775                                        vect_location,
 776                                        "early breaks not supported: "
 777                                        "overlapping loads and stores "
 778                                        "found before the break "
 779                                        "statement.\n");
 780
 781                     return opt_result::failure_at (stmt,
 782                              "can't safely apply code motion to dependencies"
 783                              " to vectorize the early exit. %G may alias with"
 784                              " %G\n", stmt, dr_read->stmt);
 785                   }
 786             }
 787
 788           if (gimple_vdef (stmt))
 789             {
 790               if (dump_enabled_p ())
 791                 dump_printf_loc (MSG_NOTE, vect_location,
 792                                  "==> recording stmt %G", stmt);
 793
 794               LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (stmt);
 795             }
 796           else if (gimple_vuse (stmt))
 797             {
 798               LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).safe_insert (0, stmt);
 799               if (dump_enabled_p ())
 800                 dump_printf_loc (MSG_NOTE, vect_location,
 801                                  "marked statement for vUSE update: %G", stmt);
 802             }
 803         }
 804
 805       /* Save destination as we go, BB are visited in order and the last one
 806         is where statements should be moved to.  */
 807       if (!dest_bb)
 808         dest_bb = gimple_bb (c);
 809       else
 810         {
 811           basic_block curr_bb = gimple_bb (c);
 812           if (dominated_by_p (CDI_DOMINATORS, curr_bb, dest_bb))
 813             dest_bb = curr_bb;
 814         }
 815     }
 816
 817   basic_block dest_bb0 = EDGE_SUCC (dest_bb, 0)->dest;
 818   basic_block dest_bb1 = EDGE_SUCC (dest_bb, 1)->dest;
 819   dest_bb = flow_bb_inside_loop_p (loop, dest_bb0) ? dest_bb0 : dest_bb1;
 820   /* We don't allow outer -> inner loop transitions which should have been
 821      trapped already during loop form analysis.  */
 822   gcc_assert (dest_bb->loop_father == loop);
 823
 824   gcc_assert (dest_bb);
 825   LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo) = dest_bb;
 826
 827   if (!LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).is_empty ())
 828     {
 829       /* All uses shall be updated to that of the first load.  Entries are
 830          stored in reverse order.  */
 831       tree vuse = gimple_vuse (LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).last ());
 832       for (auto g : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
 833         {
 834           if (dump_enabled_p ())
 835           dump_printf_loc (MSG_NOTE, vect_location,
 836                            "will update use: %T, mem_ref: %G", vuse, g);
 837         }
 838     }
 839
 840   if (dump_enabled_p ())
 841     dump_printf_loc (MSG_NOTE, vect_location,
 842                      "recorded statements to be moved to BB %d\n",
 843                      LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo)->index);
 844
 845   return opt_result::success ();
 846 }
 847
 848 /* Function vect_analyze_data_ref_dependences.
 849
 850    Examine all the data references in the loop, and make sure there do not
 851    exist any data dependences between them.  Set *MAX_VF according to
 852    the maximum vectorization factor the data dependences allow.  */
 853
 854 opt_result
 855 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
 856                                    unsigned int *max_vf)
 857 {
 858   unsigned int i;
 859   struct data_dependence_relation *ddr;
 860
 861   DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
 862
 863   if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
 864     {
 865       LOOP_VINFO_DDRS (loop_vinfo)
 866         .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 867                  * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 868       /* We do not need read-read dependences.  */
 869       bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 870                                           &LOOP_VINFO_DDRS (loop_vinfo),
 871                                           LOOP_VINFO_LOOP_NEST (loop_vinfo),
 872                                           false);
 873       gcc_assert (res);
 874     }
 875
 876   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 877
 878   /* For epilogues we either have no aliases or alias versioning
 879      was applied to original loop.  Therefore we may just get max_vf
 880      using VF of original loop.  */
 881   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 882     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 883   else
 884     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 885       {
 886         opt_result res
 887           = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
 888         if (!res)
 889           return res;
 890       }
 891
 892   /* If we have early break statements in the loop, check to see if they
 893      are of a form we can vectorizer.  */
 894   if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
 895     return vect_analyze_early_break_dependences (loop_vinfo);
 896
 897   return opt_result::success ();
 898 }
 899
 900
 901 /* Function vect_slp_analyze_data_ref_dependence.
 902
 903    Return TRUE if there (might) exist a dependence between a memory-reference
 904    DRA and a memory-reference DRB for VINFO.  When versioning for alias
 905    may check a dependence at run-time, return FALSE.  Adjust *MAX_VF
 906    according to the data dependence.  */
 907
 908 static bool
 909 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
 910                                       struct data_dependence_relation *ddr)
 911 {
 912   struct data_reference *dra = DDR_A (ddr);
 913   struct data_reference *drb = DDR_B (ddr);
 914   dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
 915   dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
 916
 917   /* We need to check dependences of statements marked as unvectorizable
 918      as well, they still can prohibit vectorization.  */
 919
 920   /* Independent data accesses.  */
 921   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 922     return false;
 923
 924   if (dra == drb)
 925     return false;
 926
 927   /* Read-read is OK.  */
 928   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 929     return false;
 930
 931   /* If dra and drb are part of the same interleaving chain consider
 932      them independent.  */
 933   if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
 934       && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
 935           == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
 936     return false;
 937
 938   /* Unknown data dependence.  */
 939   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 940     {
 941       if  (dump_enabled_p ())
 942         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 943                          "can't determine dependence between %T and %T\n",
 944                          DR_REF (dra), DR_REF (drb));
 945     }
 946   else if (dump_enabled_p ())
 947     dump_printf_loc (MSG_NOTE, vect_location,
 948                      "determined dependence between %T and %T\n",
 949                      DR_REF (dra), DR_REF (drb));
 950
 951   return true;
 952 }
 953
 954
 955 /* Analyze dependences involved in the transform of a store SLP NODE.  */
 956
 957 static bool
 958 vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
 959 {
 960   /* This walks over all stmts involved in the SLP store done
 961      in NODE verifying we can sink them up to the last stmt in the
 962      group.  */
 963   stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
 964   gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));
 965
 966   for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
 967     {
 968       stmt_vec_info access_info
 969         = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
 970       if (access_info == last_access_info)
 971         continue;
 972       data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
 973       ao_ref ref;
 974       bool ref_initialized_p = false;
 975       for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
 976            gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
 977         {
 978           gimple *stmt = gsi_stmt (gsi);
 979           if (! gimple_vuse (stmt))
 980             continue;
 981
 982           /* If we couldn't record a (single) data reference for this
 983              stmt we have to resort to the alias oracle.  */
 984           stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
 985           data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
 986           if (!dr_b)
 987             {
 988               /* We are moving a store - this means
 989                  we cannot use TBAA for disambiguation.  */
 990               if (!ref_initialized_p)
 991                 ao_ref_init (&ref, DR_REF (dr_a));
 992               if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
 993                   || ref_maybe_used_by_stmt_p (stmt, &ref, false))
 994                 return false;
 995               continue;
 996             }
 997
 998           gcc_assert (!gimple_visited_p (stmt));
 999
1000           ddr_p ddr = initialize_data_dependence_relation (dr_a,
1001                                                            dr_b, vNULL);
1002           bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1003           free_dependence_relation (ddr);
1004           if (dependent)
1005             return false;
1006         }
1007     }
1008   return true;
1009 }
1010
1011 /* Analyze dependences involved in the transform of a load SLP NODE.  STORES
1012    contain the vector of scalar stores of this instance if we are
1013    disambiguating the loads.  */
1014
1015 static bool
1016 vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
1017                                    vec<stmt_vec_info> stores,
1018                                    stmt_vec_info last_store_info)
1019 {
1020   /* This walks over all stmts involved in the SLP load done
1021      in NODE verifying we can hoist them up to the first stmt in the
1022      group.  */
1023   stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
1024   gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));
1025
1026   for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
1027     {
1028       stmt_vec_info access_info
1029         = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
1030       if (access_info == first_access_info)
1031         continue;
1032       data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
1033       ao_ref ref;
1034       bool ref_initialized_p = false;
1035       hash_set<stmt_vec_info> grp_visited;
1036       for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
1037            gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
1038         {
1039           gimple *stmt = gsi_stmt (gsi);
1040           if (! gimple_vdef (stmt))
1041             continue;
1042
1043           stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
1044
1045           /* If we run into a store of this same instance (we've just
1046              marked those) then delay dependence checking until we run
1047              into the last store because this is where it will have
1048              been sunk to (and we verified that we can do that already).  */
1049           if (gimple_visited_p (stmt))
1050             {
1051               if (stmt_info != last_store_info)
1052                 continue;
1053
1054               for (stmt_vec_info &store_info : stores)
1055                 {
1056                   data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
1057                   ddr_p ddr = initialize_data_dependence_relation
1058                                 (dr_a, store_dr, vNULL);
1059                   bool dependent
1060                     = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1061                   free_dependence_relation (ddr);
1062                   if (dependent)
1063                     return false;
1064                 }
1065               continue;
1066             }
1067
1068           auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
1069             {
1070               /* We are hoisting a load - this means we can use TBAA for
1071                  disambiguation.  */
1072               if (!ref_initialized_p)
1073                 ao_ref_init (&ref, DR_REF (dr_a));
1074               if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
1075                 {
1076                   /* If we couldn't record a (single) data reference for this
1077                      stmt we have to give up now.  */
1078                   data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
1079                   if (!dr_b)
1080                     return false;
1081                   ddr_p ddr = initialize_data_dependence_relation (dr_a,
1082                                                                    dr_b, vNULL);
1083                   bool dependent
1084                     = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1085                   free_dependence_relation (ddr);
1086                   if (dependent)
1087                     return false;
1088                 }
1089               /* No dependence.  */
1090               return true;
1091             };
1092           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1093             {
1094               /* When we run into a store group we have to honor
1095                  that earlier stores might be moved here.  We don't
1096                  know exactly which and where to since we lack a
1097                  back-mapping from DR to SLP node, so assume all
1098                  earlier stores are sunk here.  It's enough to
1099                  consider the last stmt of a group for this.
1100                  ???  Both this and the fact that we disregard that
1101                  the conflicting instance might be removed later
1102                  is overly conservative.  */
1103               if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
1104                 for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
1105                      store_info != NULL;
1106                      store_info = DR_GROUP_NEXT_ELEMENT (store_info))
1107                   if ((store_info == stmt_info
1108                        || get_later_stmt (store_info, stmt_info) == stmt_info)
1109                       && !check_hoist (store_info))
1110                     return false;
1111             }
1112           else
1113             {
1114               if (!check_hoist (stmt_info))
1115                 return false;
1116             }
1117         }
1118     }
1119   return true;
1120 }
1121
1122
1123 /* Function vect_analyze_data_ref_dependences.
1124
1125    Examine all the data references in the basic-block, and make sure there
1126    do not exist any data dependences between them.  Set *MAX_VF according to
1127    the maximum vectorization factor the data dependences allow.  */
1128
1129 bool
1130 vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
1131 {
1132   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
1133
1134   /* The stores of this instance are at the root of the SLP tree.  */
1135   slp_tree store = NULL;
1136   if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
1137     store = SLP_INSTANCE_TREE (instance);
1138
1139   /* Verify we can sink stores to the vectorized stmt insert location.  */
1140   stmt_vec_info last_store_info = NULL;
1141   if (store)
1142     {
1143       if (! vect_slp_analyze_store_dependences (vinfo, store))
1144         return false;
1145
1146       /* Mark stores in this instance and remember the last one.  */
1147       last_store_info = vect_find_last_scalar_stmt_in_slp (store);
1148       for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1149         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
1150     }
1151
1152   bool res = true;
1153
1154   /* Verify we can sink loads to the vectorized stmt insert location,
1155      special-casing stores of this instance.  */
1156   for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
1157     if (! vect_slp_analyze_load_dependences (vinfo, load,
1158                                              store
1159                                              ? SLP_TREE_SCALAR_STMTS (store)
1160                                              : vNULL, last_store_info))
1161       {
1162         res = false;
1163         break;
1164       }
1165
1166   /* Unset the visited flag.  */
1167   if (store)
1168     for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1169       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
1170
1171   return res;
1172 }
1173
1174 /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
1175    applied.  */
1176
1177 int
1178 dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
1179 {
1180   HOST_WIDE_INT diff = 0;
1181   /* Alignment is only analyzed for the first element of a DR group,
1182      use that but adjust misalignment by the offset of the access.  */
1183   if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
1184     {
1185       dr_vec_info *first_dr
1186         = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
1187       /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
1188          INTEGER_CSTs and the first element in the group has the lowest
1189          address.  */
1190       diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
1191               - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
1192       gcc_assert (diff >= 0);
1193       dr_info = first_dr;
1194     }
1195
1196   int misalign = dr_info->misalignment;
1197   gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
1198   if (misalign == DR_MISALIGNMENT_UNKNOWN)
1199     return misalign;
1200
1201   /* If the access is only aligned for a vector type with smaller alignment
1202      requirement the access has unknown misalignment.  */
1203   if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
1204                 targetm.vectorize.preferred_vector_alignment (vectype)))
1205     return DR_MISALIGNMENT_UNKNOWN;
1206
1207   /* Apply the offset from the DR group start and the externally supplied
1208      offset which can for example result from a negative stride access.  */
1209   poly_int64 misalignment = misalign + diff + offset;
1210
1211   /* vect_compute_data_ref_alignment will have ensured that target_alignment
1212      is constant and otherwise set misalign to DR_MISALIGNMENT_UNKNOWN.  */
1213   unsigned HOST_WIDE_INT target_alignment_c
1214     = dr_info->target_alignment.to_constant ();
1215   if (!known_misalignment (misalignment, target_alignment_c, &misalign))
1216     return DR_MISALIGNMENT_UNKNOWN;
1217   return misalign;
1218 }
1219
1220 /* Record the base alignment guarantee given by DRB, which occurs
1221    in STMT_INFO.  */
1222
1223 static void
1224 vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
1225                             innermost_loop_behavior *drb)
1226 {
1227   bool existed;
1228   std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
1229     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
1230   if (!existed || entry.second->base_alignment < drb->base_alignment)
1231     {
1232       entry = std::make_pair (stmt_info, drb);
1233       if (dump_enabled_p ())
1234         dump_printf_loc (MSG_NOTE, vect_location,
1235                          "recording new base alignment for %T\n"
1236                          "  alignment:    %d\n"
1237                          "  misalignment: %d\n"
1238                          "  based on:     %G",
1239                          drb->base_address,
1240                          drb->base_alignment,
1241                          drb->base_misalignment,
1242                          stmt_info->stmt);
1243     }
1244 }
1245
1246 /* If the region we're going to vectorize is reached, all unconditional
1247    data references occur at least once.  We can therefore pool the base
1248    alignment guarantees from each unconditional reference.  Do this by
1249    going through all the data references in VINFO and checking whether
1250    the containing statement makes the reference unconditionally.  If so,
1251    record the alignment of the base address in VINFO so that it can be
1252    used for all other references with the same base.  */
1253
1254 void
1255 vect_record_base_alignments (vec_info *vinfo)
1256 {
1257   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1258   class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
1259   for (data_reference *dr : vinfo->shared->datarefs)
1260     {
1261       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1262       stmt_vec_info stmt_info = dr_info->stmt;
1263       if (!DR_IS_CONDITIONAL_IN_STMT (dr)
1264           && STMT_VINFO_VECTORIZABLE (stmt_info)
1265           && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1266         {
1267           vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
1268
1269           /* If DR is nested in the loop that is being vectorized, we can also
1270              record the alignment of the base wrt the outer loop.  */
1271           if (loop && nested_in_vect_loop_p (loop, stmt_info))
1272             vect_record_base_alignment
1273               (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
1274         }
1275     }
1276 }
1277
1278 /* Function vect_compute_data_ref_alignment
1279
1280    Compute the misalignment of the data reference DR_INFO when vectorizing
1281    with VECTYPE.
1282
1283    Output:
1284    1. initialized misalignment info for DR_INFO
1285
1286    FOR NOW: No analysis is actually performed. Misalignment is calculated
1287    only for trivial cases. TODO.  */
1288
1289 static void
1290 vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1291                                  tree vectype)
1292 {
1293   stmt_vec_info stmt_info = dr_info->stmt;
1294   vec_base_alignments *base_alignments = &vinfo->base_alignments;
1295   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1296   class loop *loop = NULL;
1297   tree ref = DR_REF (dr_info->dr);
1298
1299   if (dump_enabled_p ())
1300     dump_printf_loc (MSG_NOTE, vect_location,
1301                      "vect_compute_data_ref_alignment:\n");
1302
1303   if (loop_vinfo)
1304     loop = LOOP_VINFO_LOOP (loop_vinfo);
1305
1306   /* Initialize misalignment to unknown.  */
1307   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1308
1309   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1310     return;
1311
1312   innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1313   bool step_preserves_misalignment_p;
1314
1315   poly_uint64 vector_alignment
1316     = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1317                  BITS_PER_UNIT);
1318   SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1319
1320   /* If the main loop has peeled for alignment we have no way of knowing
1321      whether the data accesses in the epilogues are aligned.  We can't at
1322      compile time answer the question whether we have entered the main loop or
1323      not.  Fixes PR 92351.  */
1324   if (loop_vinfo)
1325     {
1326       loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1327       if (orig_loop_vinfo
1328           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1329         return;
1330     }
1331
1332   unsigned HOST_WIDE_INT vect_align_c;
1333   if (!vector_alignment.is_constant (&vect_align_c))
1334     return;
1335
1336   /* No step for BB vectorization.  */
1337   if (!loop)
1338     {
1339       gcc_assert (integer_zerop (drb->step));
1340       step_preserves_misalignment_p = true;
1341     }
1342
1343   /* In case the dataref is in an inner-loop of the loop that is being
1344      vectorized (LOOP), we use the base and misalignment information
1345      relative to the outer-loop (LOOP).  This is ok only if the misalignment
1346      stays the same throughout the execution of the inner-loop, which is why
1347      we have to check that the stride of the dataref in the inner-loop evenly
1348      divides by the vector alignment.  */
1349   else if (nested_in_vect_loop_p (loop, stmt_info))
1350     {
1351       step_preserves_misalignment_p
1352         = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1353
1354       if (dump_enabled_p ())
1355         {
1356           if (step_preserves_misalignment_p)
1357             dump_printf_loc (MSG_NOTE, vect_location,
1358                              "inner step divides the vector alignment.\n");
1359           else
1360             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1361                              "inner step doesn't divide the vector"
1362                              " alignment.\n");
1363         }
1364     }
1365
1366   /* Similarly we can only use base and misalignment information relative to
1367      an innermost loop if the misalignment stays the same throughout the
1368      execution of the loop.  As above, this is the case if the stride of
1369      the dataref evenly divides by the alignment.  */
1370   else
1371     {
1372       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1373       step_preserves_misalignment_p
1374         = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
1375
1376       if (!step_preserves_misalignment_p && dump_enabled_p ())
1377         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1378                          "step doesn't divide the vector alignment.\n");
1379     }
1380
1381   unsigned int base_alignment = drb->base_alignment;
1382   unsigned int base_misalignment = drb->base_misalignment;
1383
1384   /* Calculate the maximum of the pooled base address alignment and the
1385      alignment that we can compute for DR itself.  */
1386   std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1387     = base_alignments->get (drb->base_address);
1388   if (entry
1389       && base_alignment < (*entry).second->base_alignment
1390       && (loop_vinfo
1391           || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1392                               gimple_bb (entry->first->stmt))
1393               && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1394                   || (entry->first->dr_aux.group <= dr_info->group)))))
1395     {
1396       base_alignment = entry->second->base_alignment;
1397       base_misalignment = entry->second->base_misalignment;
1398     }
1399
1400   if (drb->offset_alignment < vect_align_c
1401       || !step_preserves_misalignment_p
1402       /* We need to know whether the step wrt the vectorized loop is
1403          negative when computing the starting misalignment below.  */
1404       || TREE_CODE (drb->step) != INTEGER_CST)
1405     {
1406       if (dump_enabled_p ())
1407         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1408                          "Unknown alignment for access: %T\n", ref);
1409       return;
1410     }
1411
1412   if (base_alignment < vect_align_c)
1413     {
1414       unsigned int max_alignment;
1415       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1416       if (max_alignment < vect_align_c
1417           || !vect_can_force_dr_alignment_p (base,
1418                                              vect_align_c * BITS_PER_UNIT))
1419         {
1420           if (dump_enabled_p ())
1421             dump_printf_loc (MSG_NOTE, vect_location,
1422                              "can't force alignment of ref: %T\n", ref);
1423           return;
1424         }
1425
1426       /* Force the alignment of the decl.
1427          NOTE: This is the only change to the code we make during
1428          the analysis phase, before deciding to vectorize the loop.  */
1429       if (dump_enabled_p ())
1430         dump_printf_loc (MSG_NOTE, vect_location,
1431                          "force alignment of %T\n", ref);
1432
1433       dr_info->base_decl = base;
1434       dr_info->base_misaligned = true;
1435       base_misalignment = 0;
1436     }
1437   poly_int64 misalignment
1438     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1439
1440   unsigned int const_misalignment;
1441   if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1442     {
1443       if (dump_enabled_p ())
1444         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1445                          "Non-constant misalignment for access: %T\n", ref);
1446       return;
1447     }
1448
1449   SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1450
1451   if (dump_enabled_p ())
1452     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1453                      "misalign = %d bytes of ref %T\n",
1454                      const_misalignment, ref);
1455
1456   return;
1457 }
1458
1459 /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1460    that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1461    is made aligned via peeling.  */
1462
1463 static bool
1464 vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1465                                          dr_vec_info *dr_peel_info)
1466 {
1467   if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1468                   DR_TARGET_ALIGNMENT (dr_info)))
1469     {
1470       poly_offset_int diff
1471         = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1472            - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1473       if (known_eq (diff, 0)
1474           || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1475         return true;
1476     }
1477   return false;
1478 }
1479
1480 /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1481    aligned via peeling.  */
1482
1483 static bool
1484 vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1485                                  dr_vec_info *dr_peel_info)
1486 {
1487   if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1488                         DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1489       || !operand_equal_p (DR_OFFSET (dr_info->dr),
1490                            DR_OFFSET (dr_peel_info->dr), 0)
1491       || !operand_equal_p (DR_STEP (dr_info->dr),
1492                            DR_STEP (dr_peel_info->dr), 0))
1493     return false;
1494
1495   return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1496 }
1497
1498 /* Compute the value for dr_info->misalign so that the access appears
1499    aligned.  This is used by peeling to compensate for dr_misalignment
1500    applying the offset for negative step.  */
1501
1502 int
1503 vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1504 {
1505   if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1506     return 0;
1507
1508   tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1509   poly_int64 misalignment
1510     = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1511        * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1512
1513   unsigned HOST_WIDE_INT target_alignment_c;
1514   int misalign;
1515   if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1516       || !known_misalignment (misalignment, target_alignment_c, &misalign))
1517     return DR_MISALIGNMENT_UNKNOWN;
1518   return misalign;
1519 }
1520
1521 /* Function vect_update_misalignment_for_peel.
1522    Sets DR_INFO's misalignment
1523    - to 0 if it has the same alignment as DR_PEEL_INFO,
1524    - to the misalignment computed using NPEEL if DR_INFO's salignment is known,
1525    - to -1 (unknown) otherwise.
1526
1527    DR_INFO - the data reference whose misalignment is to be adjusted.
1528    DR_PEEL_INFO - the data reference whose misalignment is being made
1529                   zero in the vector loop by the peel.
1530    NPEEL - the number of iterations in the peel loop if the misalignment
1531            of DR_PEEL_INFO is known at compile time.  */
1532
1533 static void
1534 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1535                                    dr_vec_info *dr_peel_info, int npeel)
1536 {
1537   /* If dr_info is aligned of dr_peel_info is, then mark it so.  */
1538   if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1539     {
1540       SET_DR_MISALIGNMENT (dr_info,
1541                            vect_dr_misalign_for_aligned_access (dr_peel_info));
1542       return;
1543     }
1544
1545   unsigned HOST_WIDE_INT alignment;
1546   if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1547       && known_alignment_for_access_p (dr_info,
1548                                        STMT_VINFO_VECTYPE (dr_info->stmt))
1549       && known_alignment_for_access_p (dr_peel_info,
1550                                        STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1551     {
1552       int misal = dr_info->misalignment;
1553       misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1554       misal &= alignment - 1;
1555       set_dr_misalignment (dr_info, misal);
1556       return;
1557     }
1558
1559   if (dump_enabled_p ())
1560     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1561                      "to unknown (-1).\n");
1562   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1563 }
1564
1565 /* Return true if alignment is relevant for DR_INFO.  */
1566
1567 static bool
1568 vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1569 {
1570   stmt_vec_info stmt_info = dr_info->stmt;
1571
1572   if (!STMT_VINFO_RELEVANT_P (stmt_info))
1573     return false;
1574
1575   /* For interleaving, only the alignment of the first access matters.  */
1576   if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1577       && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1578     return false;
1579
1580   /* Scatter-gather and invariant accesses continue to address individual
1581      scalars, so vector-level alignment is irrelevant.  */
1582   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1583       || integer_zerop (DR_STEP (dr_info->dr)))
1584     return false;
1585
1586   /* Strided accesses perform only component accesses, alignment is
1587      irrelevant for them.  */
1588   if (STMT_VINFO_STRIDED_P (stmt_info)
1589       && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1590     return false;
1591
1592   return true;
1593 }
1594
1595 /* Given an memory reference EXP return whether its alignment is less
1596    than its size.  */
1597
1598 static bool
1599 not_size_aligned (tree exp)
1600 {
1601   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1602     return true;
1603
1604   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1605           > get_object_alignment (exp));
1606 }
1607
1608 /* Function vector_alignment_reachable_p
1609
1610    Return true if vector alignment for DR_INFO is reachable by peeling
1611    a few loop iterations.  Return false otherwise.  */
1612
1613 static bool
1614 vector_alignment_reachable_p (dr_vec_info *dr_info)
1615 {
1616   stmt_vec_info stmt_info = dr_info->stmt;
1617   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1618
1619   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1620     {
1621       /* For interleaved access we peel only if number of iterations in
1622          the prolog loop ({VF - misalignment}), is a multiple of the
1623          number of the interleaved accesses.  */
1624       int elem_size, mis_in_elements;
1625
1626       /* FORNOW: handle only known alignment.  */
1627       if (!known_alignment_for_access_p (dr_info, vectype))
1628         return false;
1629
1630       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1631       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1632       elem_size = vector_element_size (vector_size, nelements);
1633       mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1634
1635       if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1636         return false;
1637     }
1638
1639   /* If misalignment is known at the compile time then allow peeling
1640      only if natural alignment is reachable through peeling.  */
1641   if (known_alignment_for_access_p (dr_info, vectype)
1642       && !aligned_access_p (dr_info, vectype))
1643     {
1644       HOST_WIDE_INT elmsize =
1645                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1646       if (dump_enabled_p ())
1647         {
1648           dump_printf_loc (MSG_NOTE, vect_location,
1649                            "data size = %wd. misalignment = %d.\n", elmsize,
1650                            dr_misalignment (dr_info, vectype));
1651         }
1652       if (dr_misalignment (dr_info, vectype) % elmsize)
1653         {
1654           if (dump_enabled_p ())
1655             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1656                              "data size does not divide the misalignment.\n");
1657           return false;
1658         }
1659     }
1660
1661   if (!known_alignment_for_access_p (dr_info, vectype))
1662     {
1663       tree type = TREE_TYPE (DR_REF (dr_info->dr));
1664       bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1665       if (dump_enabled_p ())
1666         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1667                          "Unknown misalignment, %snaturally aligned\n",
1668                          is_packed ? "not " : "");
1669       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1670     }
1671
1672   return true;
1673 }
1674
1675
1676 /* Calculate the cost of the memory access represented by DR_INFO.  */
1677
1678 static void
1679 vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1680                            dr_alignment_support alignment_support_scheme,
1681                            int misalignment,
1682                            unsigned int *inside_cost,
1683                            unsigned int *outside_cost,
1684                            stmt_vector_for_cost *body_cost_vec,
1685                            stmt_vector_for_cost *prologue_cost_vec)
1686 {
1687   stmt_vec_info stmt_info = dr_info->stmt;
1688   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1689   int ncopies;
1690
1691   if (PURE_SLP_STMT (stmt_info))
1692     ncopies = 1;
1693   else
1694     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1695
1696   if (DR_IS_READ (dr_info->dr))
1697     vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
1698                         misalignment, true, inside_cost,
1699                         outside_cost, prologue_cost_vec, body_cost_vec, false);
1700   else
1701     vect_get_store_cost (vinfo,stmt_info, ncopies, alignment_support_scheme,
1702                          misalignment, inside_cost, body_cost_vec);
1703
1704   if (dump_enabled_p ())
1705     dump_printf_loc (MSG_NOTE, vect_location,
1706                      "vect_get_data_access_cost: inside_cost = %d, "
1707                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1708 }
1709
1710
1711 typedef struct _vect_peel_info
1712 {
1713   dr_vec_info *dr_info;
1714   int npeel;
1715   unsigned int count;
1716 } *vect_peel_info;
1717
1718 typedef struct _vect_peel_extended_info
1719 {
1720   vec_info *vinfo;
1721   struct _vect_peel_info peel_info;
1722   unsigned int inside_cost;
1723   unsigned int outside_cost;
1724 } *vect_peel_extended_info;
1725
1726
1727 /* Peeling hashtable helpers.  */
1728
1729 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1730 {
1731   static inline hashval_t hash (const _vect_peel_info *);
1732   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1733 };
1734
1735 inline hashval_t
1736 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1737 {
1738   return (hashval_t) peel_info->npeel;
1739 }
1740
1741 inline bool
1742 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1743 {
1744   return (a->npeel == b->npeel);
1745 }
1746
1747
1748 /* Insert DR_INFO into peeling hash table with NPEEL as key.  */
1749
1750 static void
1751 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1752                           loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1753                           int npeel, bool supportable_if_not_aligned)
1754 {
1755   struct _vect_peel_info elem, *slot;
1756   _vect_peel_info **new_slot;
1757
1758   elem.npeel = npeel;
1759   slot = peeling_htab->find (&elem);
1760   if (slot)
1761     slot->count++;
1762   else
1763     {
1764       slot = XNEW (struct _vect_peel_info);
1765       slot->npeel = npeel;
1766       slot->dr_info = dr_info;
1767       slot->count = 1;
1768       new_slot = peeling_htab->find_slot (slot, INSERT);
1769       *new_slot = slot;
1770     }
1771
1772   /* If this DR is not supported with unknown misalignment then bias
1773      this slot when the cost model is disabled.  */
1774   if (!supportable_if_not_aligned
1775       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1776     slot->count += VECT_MAX_COST;
1777 }
1778
1779
1780 /* Traverse peeling hash table to find peeling option that aligns maximum
1781    number of data accesses.  */
1782
1783 int
1784 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1785                                      _vect_peel_extended_info *max)
1786 {
1787   vect_peel_info elem = *slot;
1788
1789   if (elem->count > max->peel_info.count
1790       || (elem->count == max->peel_info.count
1791           && max->peel_info.npeel > elem->npeel))
1792     {
1793       max->peel_info.npeel = elem->npeel;
1794       max->peel_info.count = elem->count;
1795       max->peel_info.dr_info = elem->dr_info;
1796     }
1797
1798   return 1;
1799 }
1800
1801 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1802    data access costs for all data refs.  If UNKNOWN_MISALIGNMENT is true,
1803    npeel is computed at runtime but DR0_INFO's misalignment will be zero
1804    after peeling.  */
1805
1806 static void
1807 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1808                                 dr_vec_info *dr0_info,
1809                                 unsigned int *inside_cost,
1810                                 unsigned int *outside_cost,
1811                                 stmt_vector_for_cost *body_cost_vec,
1812                                 stmt_vector_for_cost *prologue_cost_vec,
1813                                 unsigned int npeel)
1814 {
1815   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1816
1817   bool dr0_alignment_known_p
1818     = (dr0_info
1819        && known_alignment_for_access_p (dr0_info,
1820                                         STMT_VINFO_VECTYPE (dr0_info->stmt)));
1821
1822   for (data_reference *dr : datarefs)
1823     {
1824       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1825       if (!vect_relevant_for_alignment_p (dr_info))
1826         continue;
1827
1828       tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1829       dr_alignment_support alignment_support_scheme;
1830       int misalignment;
1831       unsigned HOST_WIDE_INT alignment;
1832
1833       bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1834                                             size_zero_node) < 0;
1835       poly_int64 off = 0;
1836       if (negative)
1837         off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1838                * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1839
1840       if (npeel == 0)
1841         misalignment = dr_misalignment (dr_info, vectype, off);
1842       else if (dr_info == dr0_info
1843                || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1844         misalignment = 0;
1845       else if (!dr0_alignment_known_p
1846                || !known_alignment_for_access_p (dr_info, vectype)
1847                || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1848         misalignment = DR_MISALIGNMENT_UNKNOWN;
1849       else
1850         {
1851           misalignment = dr_misalignment (dr_info, vectype, off);
1852           misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1853           misalignment &= alignment - 1;
1854         }
1855       alignment_support_scheme
1856         = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1857                                          misalignment);
1858
1859       vect_get_data_access_cost (loop_vinfo, dr_info,
1860                                  alignment_support_scheme, misalignment,
1861                                  inside_cost, outside_cost,
1862                                  body_cost_vec, prologue_cost_vec);
1863     }
1864 }
1865
1866 /* Traverse peeling hash table and calculate cost for each peeling option.
1867    Find the one with the lowest cost.  */
1868
1869 int
1870 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1871                                    _vect_peel_extended_info *min)
1872 {
1873   vect_peel_info elem = *slot;
1874   int dummy;
1875   unsigned int inside_cost = 0, outside_cost = 0;
1876   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1877   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1878                        epilogue_cost_vec;
1879
1880   prologue_cost_vec.create (2);
1881   body_cost_vec.create (2);
1882   epilogue_cost_vec.create (2);
1883
1884   vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1885                                   &outside_cost, &body_cost_vec,
1886                                   &prologue_cost_vec, elem->npeel);
1887
1888   body_cost_vec.release ();
1889
1890   outside_cost += vect_get_known_peeling_cost
1891     (loop_vinfo, elem->npeel, &dummy,
1892      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1893      &prologue_cost_vec, &epilogue_cost_vec);
1894
1895   /* Prologue and epilogue costs are added to the target model later.
1896      These costs depend only on the scalar iteration cost, the
1897      number of peeling iterations finally chosen, and the number of
1898      misaligned statements.  So discard the information found here.  */
1899   prologue_cost_vec.release ();
1900   epilogue_cost_vec.release ();
1901
1902   if (inside_cost < min->inside_cost
1903       || (inside_cost == min->inside_cost
1904           && outside_cost < min->outside_cost))
1905     {
1906       min->inside_cost = inside_cost;
1907       min->outside_cost = outside_cost;
1908       min->peel_info.dr_info = elem->dr_info;
1909       min->peel_info.npeel = elem->npeel;
1910       min->peel_info.count = elem->count;
1911     }
1912
1913   return 1;
1914 }
1915
1916
1917 /* Choose best peeling option by traversing peeling hash table and either
1918    choosing an option with the lowest cost (if cost model is enabled) or the
1919    option that aligns as many accesses as possible.  */
1920
1921 static struct _vect_peel_extended_info
1922 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1923                                        loop_vec_info loop_vinfo)
1924 {
1925    struct _vect_peel_extended_info res;
1926
1927    res.peel_info.dr_info = NULL;
1928    res.vinfo = loop_vinfo;
1929
1930    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1931      {
1932        res.inside_cost = INT_MAX;
1933        res.outside_cost = INT_MAX;
1934        peeling_htab->traverse <_vect_peel_extended_info *,
1935                                vect_peeling_hash_get_lowest_cost> (&res);
1936      }
1937    else
1938      {
1939        res.peel_info.count = 0;
1940        peeling_htab->traverse <_vect_peel_extended_info *,
1941                                vect_peeling_hash_get_most_frequent> (&res);
1942        res.inside_cost = 0;
1943        res.outside_cost = 0;
1944      }
1945
1946    return res;
1947 }
1948
1949 /* Return true if the new peeling NPEEL is supported.  */
1950
1951 static bool
1952 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1953                           unsigned npeel)
1954 {
1955   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1956   enum dr_alignment_support supportable_dr_alignment;
1957
1958   bool dr0_alignment_known_p
1959     = known_alignment_for_access_p (dr0_info,
1960                                     STMT_VINFO_VECTYPE (dr0_info->stmt));
1961
1962   /* Ensure that all data refs can be vectorized after the peel.  */
1963   for (data_reference *dr : datarefs)
1964     {
1965       if (dr == dr0_info->dr)
1966         continue;
1967
1968       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1969       if (!vect_relevant_for_alignment_p (dr_info)
1970           || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1971         continue;
1972
1973       tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1974       int misalignment;
1975       unsigned HOST_WIDE_INT alignment;
1976       if (!dr0_alignment_known_p
1977           || !known_alignment_for_access_p (dr_info, vectype)
1978           || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1979         misalignment = DR_MISALIGNMENT_UNKNOWN;
1980       else
1981         {
1982           misalignment = dr_misalignment (dr_info, vectype);
1983           misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1984           misalignment &= alignment - 1;
1985         }
1986       supportable_dr_alignment
1987         = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1988                                          misalignment);
1989       if (supportable_dr_alignment == dr_unaligned_unsupported)
1990         return false;
1991     }
1992
1993   return true;
1994 }
1995
1996 /* Compare two data-references DRA and DRB to group them into chunks
1997    with related alignment.  */
1998
1999 static int
2000 dr_align_group_sort_cmp (const void *dra_, const void *drb_)
2001 {
2002   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2003   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2004   int cmp;
2005
2006   /* Stabilize sort.  */
2007   if (dra == drb)
2008     return 0;
2009
2010   /* Ordering of DRs according to base.  */
2011   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2012                                DR_BASE_ADDRESS (drb));
2013   if (cmp != 0)
2014     return cmp;
2015
2016   /* And according to DR_OFFSET.  */
2017   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2018   if (cmp != 0)
2019     return cmp;
2020
2021   /* And after step.  */
2022   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2023   if (cmp != 0)
2024     return cmp;
2025
2026   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2027   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2028   if (cmp == 0)
2029     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2030   return cmp;
2031 }
2032
2033 /* Function vect_enhance_data_refs_alignment
2034
2035    This pass will use loop versioning and loop peeling in order to enhance
2036    the alignment of data references in the loop.
2037
2038    FOR NOW: we assume that whatever versioning/peeling takes place, only the
2039    original loop is to be vectorized.  Any other loops that are created by
2040    the transformations performed in this pass - are not supposed to be
2041    vectorized.  This restriction will be relaxed.
2042
   This pass will require a cost model to guide it whether to apply peeling
   or versioning or a combination of the two.  For example, the scheme that
   Intel uses when given a loop with several memory accesses, is as follows:
   choose one memory access ('p') whose alignment you want to force by doing
   peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
   other accesses are not necessarily aligned, or (2) use loop versioning to
   generate one loop in which all accesses are aligned, and another loop in
   which only 'p' is necessarily aligned.

   ("Automatic Intra-Register Vectorization for the Intel Architecture",
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
   Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
2055
2056    Devising a cost model is the most critical aspect of this work.  It will
2057    guide us on which access to peel for, whether to use loop versioning, how
2058    many versions to create, etc.  The cost model will probably consist of
2059    generic considerations as well as target specific considerations (on
2060    powerpc for example, misaligned stores are more painful than misaligned
2061    loads).
2062
2063    Here are the general steps involved in alignment enhancements:
2064
2065      -- original loop, before alignment analysis:
2066         for (i=0; i<N; i++){
2067           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
2068           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
2069         }
2070
2071      -- After vect_compute_data_refs_alignment:
2072         for (i=0; i<N; i++){
2073           x = q[i];                     # DR_MISALIGNMENT(q) = 3
2074           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
2075         }
2076
2077      -- Possibility 1: we do loop versioning:
2078      if (p is aligned) {
2079         for (i=0; i<N; i++){    # loop 1A
2080           x = q[i];                     # DR_MISALIGNMENT(q) = 3
2081           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
2082         }
2083      }
2084      else {
2085         for (i=0; i<N; i++){    # loop 1B
2086           x = q[i];                     # DR_MISALIGNMENT(q) = 3
2087           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
2088         }
2089      }
2090
2091      -- Possibility 2: we do loop peeling:
2092      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
2093         x = q[i];
2094         p[i] = y;
2095      }
2096      for (i = 3; i < N; i++){   # loop 2A
2097         x = q[i];                       # DR_MISALIGNMENT(q) = 0
2098         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
2099      }
2100
2101      -- Possibility 3: combination of loop peeling and versioning:
2102      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
2103         x = q[i];
2104         p[i] = y;
2105      }
2106      if (p is aligned) {
2107         for (i = 3; i<N; i++){  # loop 3A
2108           x = q[i];                     # DR_MISALIGNMENT(q) = 0
2109           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
2110         }
2111      }
2112      else {
2113         for (i = 3; i<N; i++){  # loop 3B
2114           x = q[i];                     # DR_MISALIGNMENT(q) = 0
2115           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
2116         }
2117      }
2118
2119      These loops are later passed to loop_transform to be vectorized.  The
2120      vectorizer will use the alignment information to guide the transformation
2121      (whether to generate regular loads/stores, or with special handling for
2122      misalignment).  */
2123
2124 opt_result
2125 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
2126 {
2127   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2128   dr_vec_info *first_store = NULL;
2129   dr_vec_info *dr0_info = NULL;
2130   struct data_reference *dr;
2131   unsigned int i;
2132   bool do_peeling = false;
2133   bool do_versioning = false;
2134   unsigned int npeel = 0;
2135   bool one_misalignment_known = false;
2136   bool one_misalignment_unknown = false;
2137   bool one_dr_unsupportable = false;
2138   dr_vec_info *unsupportable_dr_info = NULL;
2139   unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
2140   hash_table<peel_info_hasher> peeling_htab (1);
2141
2142   DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
2143
2144   /* Reset data so we can safely be called multiple times.  */
2145   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2146   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
2147
2148   if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
2149     return opt_result::success ();
2150
2151   /* Sort the vector of datarefs so DRs that have the same or dependent
2152      alignment are next to each other.  */
2153   auto_vec<data_reference_p> datarefs
2154     = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
2155   datarefs.qsort (dr_align_group_sort_cmp);
2156
2157   /* Compute the number of DRs that become aligned when we peel
2158      a dataref so it becomes aligned.  */
2159   auto_vec<unsigned> n_same_align_refs (datarefs.length ());
2160   n_same_align_refs.quick_grow_cleared (datarefs.length ());
2161   unsigned i0;
2162   for (i0 = 0; i0 < datarefs.length (); ++i0)
2163     if (DR_BASE_ADDRESS (datarefs[i0]))
2164       break;
2165   for (i = i0 + 1; i <= datarefs.length (); ++i)
2166     {
2167       if (i == datarefs.length ()
2168           || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
2169                                DR_BASE_ADDRESS (datarefs[i]), 0)
2170           || !operand_equal_p (DR_OFFSET (datarefs[i0]),
2171                                DR_OFFSET (datarefs[i]), 0)
2172           || !operand_equal_p (DR_STEP (datarefs[i0]),
2173                                DR_STEP (datarefs[i]), 0))
2174         {
2175           /* The subgroup [i0, i-1] now only differs in DR_INIT and
2176              possibly DR_TARGET_ALIGNMENT.  Still the whole subgroup
2177              will get known misalignment if we align one of the refs
2178              with the largest DR_TARGET_ALIGNMENT.  */
2179           for (unsigned j = i0; j < i; ++j)
2180             {
2181               dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
2182               for (unsigned k = i0; k < i; ++k)
2183                 {
2184                   if (k == j)
2185                     continue;
2186                   dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
2187                   if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
2188                                                                dr_infoj))
2189                     n_same_align_refs[j]++;
2190                 }
2191             }
2192           i0 = i;
2193         }
2194     }
2195
2196   /* While cost model enhancements are expected in the future, the high level
2197      view of the code at this time is as follows:
2198
2199      A) If there is a misaligned access then see if peeling to align
2200         this access can make all data references satisfy
2201         vect_supportable_dr_alignment.  If so, update data structures
2202         as needed and return true.
2203
2204      B) If peeling wasn't possible and there is a data reference with an
2205         unknown misalignment that does not satisfy vect_supportable_dr_alignment
2206         then see if loop versioning checks can be used to make all data
2207         references satisfy vect_supportable_dr_alignment.  If so, update
2208         data structures as needed and return true.
2209
2210      C) If neither peeling nor versioning were successful then return false if
2211         any data reference does not satisfy vect_supportable_dr_alignment.
2212
2213      D) Return true (all data references satisfy vect_supportable_dr_alignment).
2214
2215      Note, Possibility 3 above (which is peeling and versioning together) is not
2216      being done at this time.  */
2217
2218   /* (1) Peeling to force alignment.  */
2219
2220   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
2221      Considerations:
2222      + How many accesses will become aligned due to the peeling
2223      - How many accesses will become unaligned due to the peeling,
2224        and the cost of misaligned accesses.
2225      - The cost of peeling (the extra runtime checks, the increase
2226        in code size).  */
2227
2228   FOR_EACH_VEC_ELT (datarefs, i, dr)
2229     {
2230       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2231       if (!vect_relevant_for_alignment_p (dr_info))
2232         continue;
2233
2234       stmt_vec_info stmt_info = dr_info->stmt;
2235       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2236       do_peeling = vector_alignment_reachable_p (dr_info);
2237       if (do_peeling)
2238         {
2239           if (known_alignment_for_access_p (dr_info, vectype))
2240             {
2241               unsigned int npeel_tmp = 0;
2242               bool negative = tree_int_cst_compare (DR_STEP (dr),
2243                                                     size_zero_node) < 0;
2244
2245               /* If known_alignment_for_access_p then we have set
2246                  DR_MISALIGNMENT which is only done if we know it at compiler
2247                  time, so it is safe to assume target alignment is constant.
2248                */
2249               unsigned int target_align =
2250                 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
2251               unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
2252               poly_int64 off = 0;
2253               if (negative)
2254                 off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
2255               unsigned int mis = dr_misalignment (dr_info, vectype, off);
2256               mis = negative ? mis : -mis;
2257               if (mis != 0)
2258                 npeel_tmp = (mis & (target_align - 1)) / dr_size;
2259
2260               /* For multiple types, it is possible that the bigger type access
2261                  will have more than one peeling option.  E.g., a loop with two
2262                  types: one of size (vector size / 4), and the other one of
2263                  size (vector size / 8).  Vectorization factor will 8.  If both
2264                  accesses are misaligned by 3, the first one needs one scalar
2265                  iteration to be aligned, and the second one needs 5.  But the
2266                  first one will be aligned also by peeling 5 scalar
2267                  iterations, and in that case both accesses will be aligned.
2268                  Hence, except for the immediate peeling amount, we also want
2269                  to try to add full vector size, while we don't exceed
2270                  vectorization factor.
2271                  We do this automatically for cost model, since we calculate
2272                  cost for every peeling option.  */
2273               poly_uint64 nscalars = npeel_tmp;
2274               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2275                 {
2276                   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2277                   nscalars = (STMT_SLP_TYPE (stmt_info)
2278                               ? vf * DR_GROUP_SIZE (stmt_info) : vf);
2279                 }
2280
2281               /* Save info about DR in the hash table.  Also include peeling
2282                  amounts according to the explanation above.  Indicate
2283                  the alignment status when the ref is not aligned.
2284                  ???  Rather than using unknown alignment here we should
2285                  prune all entries from the peeling hashtable which cause
2286                  DRs to be not supported.  */
2287               bool supportable_if_not_aligned
2288                 = vect_supportable_dr_alignment
2289                     (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2290               while (known_le (npeel_tmp, nscalars))
2291                 {
2292                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2293                                             dr_info, npeel_tmp,
2294                                             supportable_if_not_aligned);
2295                   npeel_tmp += MAX (1, target_align / dr_size);
2296                 }
2297
2298               one_misalignment_known = true;
2299             }
2300           else
2301             {
2302               /* If we don't know any misalignment values, we prefer
2303                  peeling for data-ref that has the maximum number of data-refs
2304                  with the same alignment, unless the target prefers to align
2305                  stores over load.  */
2306               unsigned same_align_drs = n_same_align_refs[i];
2307               if (!dr0_info
2308                   || dr0_same_align_drs < same_align_drs)
2309                 {
2310                   dr0_same_align_drs = same_align_drs;
2311                   dr0_info = dr_info;
2312                 }
2313               /* For data-refs with the same number of related
2314                  accesses prefer the one where the misalign
2315                  computation will be invariant in the outermost loop.  */
2316               else if (dr0_same_align_drs == same_align_drs)
2317                 {
2318                   class loop *ivloop0, *ivloop;
2319                   ivloop0 = outermost_invariant_loop_for_expr
2320                     (loop, DR_BASE_ADDRESS (dr0_info->dr));
2321                   ivloop = outermost_invariant_loop_for_expr
2322                     (loop, DR_BASE_ADDRESS (dr));
2323                   if ((ivloop && !ivloop0)
2324                       || (ivloop && ivloop0
2325                           && flow_loop_nested_p (ivloop, ivloop0)))
2326                     dr0_info = dr_info;
2327                 }
2328
2329               one_misalignment_unknown = true;
2330
2331               /* Check for data refs with unsupportable alignment that
2332                  can be peeled.  */
2333               enum dr_alignment_support supportable_dr_alignment
2334                 = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2335                                                  DR_MISALIGNMENT_UNKNOWN);
2336               if (supportable_dr_alignment == dr_unaligned_unsupported)
2337                 {
2338                   one_dr_unsupportable = true;
2339                   unsupportable_dr_info = dr_info;
2340                 }
2341
2342               if (!first_store && DR_IS_WRITE (dr))
2343                 {
2344                   first_store = dr_info;
2345                   first_store_same_align_drs = same_align_drs;
2346                 }
2347             }
2348         }
2349       else
2350         {
2351           if (!aligned_access_p (dr_info, vectype))
2352             {
2353               if (dump_enabled_p ())
2354                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2355                                  "vector alignment may not be reachable\n");
2356               break;
2357             }
2358         }
2359     }
2360
2361   /* Check if we can possibly peel the loop.  */
2362   if (!vect_can_advance_ivs_p (loop_vinfo)
2363       || !slpeel_can_duplicate_loop_p (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
2364                                        LOOP_VINFO_IV_EXIT (loop_vinfo))
2365       || loop->inner)
2366     do_peeling = false;
2367
2368   struct _vect_peel_extended_info peel_for_known_alignment;
2369   struct _vect_peel_extended_info peel_for_unknown_alignment;
2370   struct _vect_peel_extended_info best_peel;
2371
2372   peel_for_unknown_alignment.inside_cost = INT_MAX;
2373   peel_for_unknown_alignment.outside_cost = INT_MAX;
2374   peel_for_unknown_alignment.peel_info.count = 0;
2375
2376   if (do_peeling
2377       && one_misalignment_unknown)
2378     {
2379       /* Check if the target requires to prefer stores over loads, i.e., if
2380          misaligned stores are more expensive than misaligned loads (taking
2381          drs with same alignment into account).  */
2382       unsigned int load_inside_cost = 0;
2383       unsigned int load_outside_cost = 0;
2384       unsigned int store_inside_cost = 0;
2385       unsigned int store_outside_cost = 0;
2386       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2387
2388       stmt_vector_for_cost dummy;
2389       dummy.create (2);
2390       vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2391                                       &load_inside_cost,
2392                                       &load_outside_cost,
2393                                       &dummy, &dummy, estimated_npeels);
2394       dummy.release ();
2395
2396       if (first_store)
2397         {
2398           dummy.create (2);
2399           vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2400                                           &store_inside_cost,
2401                                           &store_outside_cost,
2402                                           &dummy, &dummy,
2403                                           estimated_npeels);
2404           dummy.release ();
2405         }
2406       else
2407         {
2408           store_inside_cost = INT_MAX;
2409           store_outside_cost = INT_MAX;
2410         }
2411
2412       if (load_inside_cost > store_inside_cost
2413           || (load_inside_cost == store_inside_cost
2414               && load_outside_cost > store_outside_cost))
2415         {
2416           dr0_info = first_store;
2417           dr0_same_align_drs = first_store_same_align_drs;
2418           peel_for_unknown_alignment.inside_cost = store_inside_cost;
2419           peel_for_unknown_alignment.outside_cost = store_outside_cost;
2420         }
2421       else
2422         {
2423           peel_for_unknown_alignment.inside_cost = load_inside_cost;
2424           peel_for_unknown_alignment.outside_cost = load_outside_cost;
2425         }
2426
2427       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2428       prologue_cost_vec.create (2);
2429       epilogue_cost_vec.create (2);
2430
2431       int dummy2;
2432       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2433         (loop_vinfo, estimated_npeels, &dummy2,
2434          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2435          &prologue_cost_vec, &epilogue_cost_vec);
2436
2437       prologue_cost_vec.release ();
2438       epilogue_cost_vec.release ();
2439
2440       peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2441     }
2442
2443   peel_for_unknown_alignment.peel_info.npeel = 0;
2444   peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2445
2446   best_peel = peel_for_unknown_alignment;
2447
2448   peel_for_known_alignment.inside_cost = INT_MAX;
2449   peel_for_known_alignment.outside_cost = INT_MAX;
2450   peel_for_known_alignment.peel_info.count = 0;
2451   peel_for_known_alignment.peel_info.dr_info = NULL;
2452
2453   if (do_peeling && one_misalignment_known)
2454     {
2455       /* Peeling is possible, but there is no data access that is not supported
2456          unless aligned.  So we try to choose the best possible peeling from
2457          the hash table.  */
2458       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2459         (&peeling_htab, loop_vinfo);
2460     }
2461
2462   /* Compare costs of peeling for known and unknown alignment. */
2463   if (peel_for_known_alignment.peel_info.dr_info != NULL
2464       && peel_for_unknown_alignment.inside_cost
2465       >= peel_for_known_alignment.inside_cost)
2466     {
2467       best_peel = peel_for_known_alignment;
2468
2469       /* If the best peeling for known alignment has NPEEL == 0, perform no
2470          peeling at all except if there is an unsupportable dr that we can
2471          align.  */
2472       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2473         do_peeling = false;
2474     }
2475
2476   /* If there is an unsupportable data ref, prefer this over all choices so far
2477      since we'd have to discard a chosen peeling except when it accidentally
2478      aligned the unsupportable data ref.  */
2479   if (one_dr_unsupportable)
2480     dr0_info = unsupportable_dr_info;
2481   else if (do_peeling)
2482     {
2483       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2484          TODO: Use nopeel_outside_cost or get rid of it?  */
2485       unsigned nopeel_inside_cost = 0;
2486       unsigned nopeel_outside_cost = 0;
2487
2488       stmt_vector_for_cost dummy;
2489       dummy.create (2);
2490       vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2491                                       &nopeel_outside_cost, &dummy, &dummy, 0);
2492       dummy.release ();
2493
2494       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
2495          costs will be recorded.  */
2496       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2497       prologue_cost_vec.create (2);
2498       epilogue_cost_vec.create (2);
2499
2500       int dummy2;
2501       nopeel_outside_cost += vect_get_known_peeling_cost
2502         (loop_vinfo, 0, &dummy2,
2503          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2504          &prologue_cost_vec, &epilogue_cost_vec);
2505
2506       prologue_cost_vec.release ();
2507       epilogue_cost_vec.release ();
2508
2509       npeel = best_peel.peel_info.npeel;
2510       dr0_info = best_peel.peel_info.dr_info;
2511
2512       /* If no peeling is not more expensive than the best peeling we
2513          have so far, don't perform any peeling.  */
2514       if (nopeel_inside_cost <= best_peel.inside_cost)
2515         do_peeling = false;
2516     }
2517
2518   if (do_peeling)
2519     {
2520       stmt_vec_info stmt_info = dr0_info->stmt;
2521       if (known_alignment_for_access_p (dr0_info,
2522                                         STMT_VINFO_VECTYPE (stmt_info)))
2523         {
2524           bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2525                                                 size_zero_node) < 0;
2526           if (!npeel)
2527             {
2528               /* Since it's known at compile time, compute the number of
2529                  iterations in the peeled loop (the peeling factor) for use in
2530                  updating DR_MISALIGNMENT values.  The peeling factor is the
2531                  vectorization factor minus the misalignment as an element
2532                  count.  */
2533               tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2534               poly_int64 off = 0;
2535               if (negative)
2536                 off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2537                        * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2538               unsigned int mis
2539                 = dr_misalignment (dr0_info, vectype, off);
2540               mis = negative ? mis : -mis;
2541               /* If known_alignment_for_access_p then we have set
2542                  DR_MISALIGNMENT which is only done if we know it at compiler
2543                  time, so it is safe to assume target alignment is constant.
2544                */
2545               unsigned int target_align =
2546                 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2547               npeel = ((mis & (target_align - 1))
2548                        / vect_get_scalar_dr_size (dr0_info));
2549             }
2550
2551           /* For interleaved data access every iteration accesses all the
2552              members of the group, therefore we divide the number of iterations
2553              by the group size.  */
2554           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2555             npeel /= DR_GROUP_SIZE (stmt_info);
2556
2557           if (dump_enabled_p ())
2558             dump_printf_loc (MSG_NOTE, vect_location,
2559                              "Try peeling by %d\n", npeel);
2560         }
2561
2562       /* Ensure that all datarefs can be vectorized after the peel.  */
2563       if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2564         do_peeling = false;
2565
2566       /* Check if all datarefs are supportable and log.  */
2567       if (do_peeling
2568           && npeel == 0
2569           && known_alignment_for_access_p (dr0_info,
2570                                            STMT_VINFO_VECTYPE (stmt_info)))
2571         return opt_result::success ();
2572
2573       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2574       if (do_peeling)
2575         {
2576           unsigned max_allowed_peel
2577             = param_vect_max_peeling_for_alignment;
2578           if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2579             max_allowed_peel = 0;
2580           if (max_allowed_peel != (unsigned)-1)
2581             {
2582               unsigned max_peel = npeel;
2583               if (max_peel == 0)
2584                 {
2585                   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2586                   unsigned HOST_WIDE_INT target_align_c;
2587                   if (target_align.is_constant (&target_align_c))
2588                     max_peel =
2589                       target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2590                   else
2591                     {
2592                       do_peeling = false;
2593                       if (dump_enabled_p ())
2594                         dump_printf_loc (MSG_NOTE, vect_location,
2595                           "Disable peeling, max peels set and vector"
2596                           " alignment unknown\n");
2597                     }
2598                 }
2599               if (max_peel > max_allowed_peel)
2600                 {
2601                   do_peeling = false;
2602                   if (dump_enabled_p ())
2603                     dump_printf_loc (MSG_NOTE, vect_location,
2604                         "Disable peeling, max peels reached: %d\n", max_peel);
2605                 }
2606             }
2607         }
2608
2609       /* Cost model #2 - if peeling may result in a remaining loop not
2610          iterating enough to be vectorized then do not peel.  Since this
2611          is a cost heuristic rather than a correctness decision, use the
2612          most likely runtime value for variable vectorization factors.  */
2613       if (do_peeling
2614           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2615         {
2616           unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2617           unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2618           if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2619               < assumed_vf + max_peel)
2620             do_peeling = false;
2621         }
2622
2623       if (do_peeling)
2624         {
2625           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2626              If the misalignment of DR_i is identical to that of dr0 then set
2627              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2628              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2629              by the peeling factor times the element size of DR_i (MOD the
2630              vectorization factor times the size).  Otherwise, the
2631              misalignment of DR_i must be set to unknown.  */
2632           FOR_EACH_VEC_ELT (datarefs, i, dr)
2633             if (dr != dr0_info->dr)
2634               {
2635                 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2636                 if (!vect_relevant_for_alignment_p (dr_info))
2637                   continue;
2638
2639                 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2640               }
2641
2642           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2643           if (npeel)
2644             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2645           else
2646             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2647           SET_DR_MISALIGNMENT (dr0_info,
2648                                vect_dr_misalign_for_aligned_access (dr0_info));
2649           if (dump_enabled_p ())
2650             {
2651               dump_printf_loc (MSG_NOTE, vect_location,
2652                                "Alignment of access forced using peeling.\n");
2653               dump_printf_loc (MSG_NOTE, vect_location,
2654                                "Peeling for alignment will be applied.\n");
2655             }
2656
2657           /* The inside-loop cost will be accounted for in vectorizable_load
2658              and vectorizable_store correctly with adjusted alignments.
2659              Drop the body_cst_vec on the floor here.  */
2660           return opt_result::success ();
2661         }
2662     }
2663
2664   /* (2) Versioning to force alignment.  */
2665
2666   /* Try versioning if:
2667      1) optimize loop for speed and the cost-model is not cheap
2668      2) there is at least one unsupported misaligned data ref with an unknown
2669         misalignment, and
2670      3) all misaligned data refs with a known misalignment are supported, and
2671      4) the number of runtime alignment checks is within reason.  */
2672
2673   do_versioning
2674     = (optimize_loop_nest_for_speed_p (loop)
2675        && !loop->inner /* FORNOW */
2676        && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2677
2678   if (do_versioning)
2679     {
2680       FOR_EACH_VEC_ELT (datarefs, i, dr)
2681         {
2682           dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2683           if (!vect_relevant_for_alignment_p (dr_info))
2684             continue;
2685
2686           stmt_vec_info stmt_info = dr_info->stmt;
2687           if (STMT_VINFO_STRIDED_P (stmt_info))
2688             {
2689               do_versioning = false;
2690               break;
2691             }
2692
2693           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2694           bool negative = tree_int_cst_compare (DR_STEP (dr),
2695                                                 size_zero_node) < 0;
2696           poly_int64 off = 0;
2697           if (negative)
2698             off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2699                    * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2700           int misalignment;
2701           if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2702             continue;
2703
2704           enum dr_alignment_support supportable_dr_alignment
2705             = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2706                                              misalignment);
2707           if (supportable_dr_alignment == dr_unaligned_unsupported)
2708             {
2709               if (misalignment != DR_MISALIGNMENT_UNKNOWN
2710                   || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2711                       >= (unsigned) param_vect_max_version_for_alignment_checks))
2712                 {
2713                   do_versioning = false;
2714                   break;
2715                 }
2716
2717               /* At present we don't support versioning for alignment
2718                  with variable VF, since there's no guarantee that the
2719                  VF is a power of two.  We could relax this if we added
2720                  a way of enforcing a power-of-two size.  */
2721               unsigned HOST_WIDE_INT size;
2722               if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2723                 {
2724                   do_versioning = false;
2725                   break;
2726                 }
2727
2728               /* Forcing alignment in the first iteration is no good if
2729                  we don't keep it across iterations.  For now, just disable
2730                  versioning in this case.
2731                  ?? We could actually unroll the loop to achieve the required
2732                  overall step alignment, and forcing the alignment could be
2733                  done by doing some iterations of the non-vectorized loop.  */
2734               if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2735                                * DR_STEP_ALIGNMENT (dr),
2736                                DR_TARGET_ALIGNMENT (dr_info)))
2737                 {
2738                   do_versioning = false;
2739                   break;
2740                 }
2741
2742               /* The rightmost bits of an aligned address must be zeros.
2743                  Construct the mask needed for this test.  For example,
2744                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2745                  mask must be 15 = 0xf. */
2746               int mask = size - 1;
2747
2748               /* FORNOW: use the same mask to test all potentially unaligned
2749                  references in the loop.  */
2750               if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2751                   && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2752                 {
2753                   do_versioning = false;
2754                   break;
2755                 }
2756
2757               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2758               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2759             }
2760         }
2761
2762       /* Versioning requires at least one misaligned data reference.  */
2763       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2764         do_versioning = false;
2765       else if (!do_versioning)
2766         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2767     }
2768
2769   if (do_versioning)
2770     {
2771       const vec<stmt_vec_info> &may_misalign_stmts
2772         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2773       stmt_vec_info stmt_info;
2774
2775       /* It can now be assumed that the data references in the statements
2776          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2777          of the loop being vectorized.  */
2778       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2779         {
2780           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2781           SET_DR_MISALIGNMENT (dr_info,
2782                                vect_dr_misalign_for_aligned_access (dr_info));
2783           if (dump_enabled_p ())
2784             dump_printf_loc (MSG_NOTE, vect_location,
2785                              "Alignment of access forced using versioning.\n");
2786         }
2787
2788       if (dump_enabled_p ())
2789         dump_printf_loc (MSG_NOTE, vect_location,
2790                          "Versioning for alignment will be applied.\n");
2791
2792       /* Peeling and versioning can't be done together at this time.  */
2793       gcc_assert (! (do_peeling && do_versioning));
2794
2795       return opt_result::success ();
2796     }
2797
2798   /* This point is reached if neither peeling nor versioning is being done.  */
2799   gcc_assert (! (do_peeling || do_versioning));
2800
2801   return opt_result::success ();
2802 }
2803
2804
2805 /* Function vect_analyze_data_refs_alignment
2806
2807    Analyze the alignment of the data-references in the loop.
2808    Return FALSE if a data reference is found that cannot be vectorized.  */
2809
2810 opt_result
2811 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2812 {
2813   DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2814
2815   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2816   struct data_reference *dr;
2817   unsigned int i;
2818
2819   vect_record_base_alignments (loop_vinfo);
2820   FOR_EACH_VEC_ELT (datarefs, i, dr)
2821     {
2822       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2823       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2824         {
2825           if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2826               && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2827             continue;
2828           vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2829                                            STMT_VINFO_VECTYPE (dr_info->stmt));
2830         }
2831     }
2832
2833   return opt_result::success ();
2834 }
2835
2836
2837 /* Analyze alignment of DRs of stmts in NODE.  */
2838
2839 static bool
2840 vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2841 {
2842   /* Alignment is maintained in the first element of the group.  */
2843   stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2844   first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2845   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2846   tree vectype = SLP_TREE_VECTYPE (node);
2847   poly_uint64 vector_alignment
2848     = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2849                  BITS_PER_UNIT);
2850   if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2851     vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2852   /* Re-analyze alignment when we're facing a vectorization with a bigger
2853      alignment requirement.  */
2854   else if (known_lt (dr_info->target_alignment, vector_alignment))
2855     {
2856       poly_uint64 old_target_alignment = dr_info->target_alignment;
2857       int old_misalignment = dr_info->misalignment;
2858       vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2859       /* But keep knowledge about a smaller alignment.  */
2860       if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2861           && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2862         {
2863           dr_info->target_alignment = old_target_alignment;
2864           dr_info->misalignment = old_misalignment;
2865         }
2866     }
2867   /* When we ever face unordered target alignments the first one wins in terms
2868      of analyzing and the other will become unknown in dr_misalignment.  */
2869   return true;
2870 }
2871
2872 /* Function vect_slp_analyze_instance_alignment
2873
2874    Analyze the alignment of the data-references in the SLP instance.
2875    Return FALSE if a data reference is found that cannot be vectorized.  */
2876
2877 bool
2878 vect_slp_analyze_instance_alignment (vec_info *vinfo,
2879                                                 slp_instance instance)
2880 {
2881   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
2882
2883   slp_tree node;
2884   unsigned i;
2885   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2886     if (! vect_slp_analyze_node_alignment (vinfo, node))
2887       return false;
2888
2889   if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2890       && ! vect_slp_analyze_node_alignment
2891              (vinfo, SLP_INSTANCE_TREE (instance)))
2892     return false;
2893
2894   return true;
2895 }
2896
2897
2898 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2899    accesses of legal size, step, etc.  Detect gaps, single element
2900    interleaving, and other special cases. Set grouped access info.
2901    Collect groups of strided stores for further use in SLP analysis.
2902    Worker for vect_analyze_group_access.  */
2903
2904 static bool
2905 vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
2906 {
2907   data_reference *dr = dr_info->dr;
2908   tree step = DR_STEP (dr);
2909   tree scalar_type = TREE_TYPE (DR_REF (dr));
2910   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2911   stmt_vec_info stmt_info = dr_info->stmt;
2912   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2913   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
2914   HOST_WIDE_INT dr_step = -1;
2915   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2916   bool slp_impossible = false;
2917
2918   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2919      size of the interleaving group (including gaps).  */
2920   if (tree_fits_shwi_p (step))
2921     {
2922       dr_step = tree_to_shwi (step);
2923       /* Check that STEP is a multiple of type size.  Otherwise there is
2924          a non-element-sized gap at the end of the group which we
2925          cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2926          ???  As we can handle non-constant step fine here we should
2927          simply remove uses of DR_GROUP_GAP between the last and first
2928          element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
2929          simply not include that gap.  */
2930       if ((dr_step % type_size) != 0)
2931         {
2932           if (dump_enabled_p ())
2933             dump_printf_loc (MSG_NOTE, vect_location,
2934                              "Step %T is not a multiple of the element size"
2935                              " for %T\n",
2936                              step, DR_REF (dr));
2937           return false;
2938         }
2939       groupsize = absu_hwi (dr_step) / type_size;
2940     }
2941   else
2942     groupsize = 0;
2943
2944   /* Not consecutive access is possible only if it is a part of interleaving.  */
2945   if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2946     {
2947       /* Check if it this DR is a part of interleaving, and is a single
2948          element of the group that is accessed in the loop.  */
2949
2950       /* Gaps are supported only for loads. STEP must be a multiple of the type
2951          size.  */
2952       if (DR_IS_READ (dr)
2953           && (dr_step % type_size) == 0
2954           && groupsize > 0
2955           /* This could be UINT_MAX but as we are generating code in a very
2956              inefficient way we have to cap earlier.
2957              See PR91403 for example.  */
2958           && groupsize <= 4096)
2959         {
2960           DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2961           DR_GROUP_SIZE (stmt_info) = groupsize;
2962           DR_GROUP_GAP (stmt_info) = groupsize - 1;
2963           if (dump_enabled_p ())
2964             dump_printf_loc (MSG_NOTE, vect_location,
2965                              "Detected single element interleaving %T"
2966                              " step %T\n",
2967                              DR_REF (dr), step);
2968
2969           return true;
2970         }
2971
2972       if (dump_enabled_p ())
2973         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2974                          "not consecutive access %G", stmt_info->stmt);
2975
2976       if (bb_vinfo)
2977         {
2978           /* Mark the statement as unvectorizable.  */
2979           STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2980           return true;
2981         }
2982
2983       if (dump_enabled_p ())
2984         dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2985       STMT_VINFO_STRIDED_P (stmt_info) = true;
2986       return true;
2987     }
2988
2989   if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2990     {
2991       /* First stmt in the interleaving chain. Check the chain.  */
2992       stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2993       struct data_reference *data_ref = dr;
2994       unsigned int count = 1;
2995       tree prev_init = DR_INIT (data_ref);
2996       HOST_WIDE_INT diff, gaps = 0;
2997
2998       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2999       while (next)
3000         {
3001           /* We never have the same DR multiple times.  */
3002           gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
3003                                 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
3004
3005           data_ref = STMT_VINFO_DATA_REF (next);
3006
3007           /* All group members have the same STEP by construction.  */
3008           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
3009
3010           /* Check that the distance between two accesses is equal to the type
3011              size. Otherwise, we have gaps.  */
3012           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
3013                   - TREE_INT_CST_LOW (prev_init)) / type_size;
3014           if (diff < 1 || diff > UINT_MAX)
3015             {
3016               /* For artificial testcases with array accesses with large
3017                  constant indices we can run into overflow issues which
3018                  can end up fooling the groupsize constraint below so
3019                  check the individual gaps (which are represented as
3020                  unsigned int) as well.  */
3021               if (dump_enabled_p ())
3022                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3023                                  "interleaved access with gap larger "
3024                                  "than representable\n");
3025               return false;
3026             }
3027           if (diff != 1)
3028             {
3029               /* FORNOW: SLP of accesses with gaps is not supported.  */
3030               slp_impossible = true;
3031               if (DR_IS_WRITE (data_ref))
3032                 {
3033                   if (dump_enabled_p ())
3034                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3035                                      "interleaved store with gaps\n");
3036                   return false;
3037                 }
3038
3039               gaps += diff - 1;
3040             }
3041
3042           last_accessed_element += diff;
3043
3044           /* Store the gap from the previous member of the group. If there is no
3045              gap in the access, DR_GROUP_GAP is always 1.  */
3046           DR_GROUP_GAP (next) = diff;
3047
3048           prev_init = DR_INIT (data_ref);
3049           next = DR_GROUP_NEXT_ELEMENT (next);
3050           /* Count the number of data-refs in the chain.  */
3051           count++;
3052         }
3053
3054       if (groupsize == 0)
3055         groupsize = count + gaps;
3056
3057       /* This could be UINT_MAX but as we are generating code in a very
3058          inefficient way we have to cap earlier.  See PR78699 for example.  */
3059       if (groupsize > 4096)
3060         {
3061           if (dump_enabled_p ())
3062             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3063                              "group is too large\n");
3064           return false;
3065         }
3066
3067       /* Check that the size of the interleaving is equal to count for stores,
3068          i.e., that there are no gaps.  */
3069       if (groupsize != count
3070           && !DR_IS_READ (dr))
3071         {
3072           groupsize = count;
3073           STMT_VINFO_STRIDED_P (stmt_info) = true;
3074         }
3075
3076       /* If there is a gap after the last load in the group it is the
3077          difference between the groupsize and the last accessed
3078          element.
3079          When there is no gap, this difference should be 0.  */
3080       DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
3081
3082       DR_GROUP_SIZE (stmt_info) = groupsize;
3083       if (dump_enabled_p ())
3084         {
3085           dump_printf_loc (MSG_NOTE, vect_location,
3086                            "Detected interleaving ");
3087           if (DR_IS_READ (dr))
3088             dump_printf (MSG_NOTE, "load ");
3089           else if (STMT_VINFO_STRIDED_P (stmt_info))
3090             dump_printf (MSG_NOTE, "strided store ");
3091           else
3092             dump_printf (MSG_NOTE, "store ");
3093           dump_printf (MSG_NOTE, "of size %u\n",
3094                        (unsigned)groupsize);
3095           dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
3096           next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3097           while (next)
3098             {
3099               if (DR_GROUP_GAP (next) != 1)
3100                 dump_printf_loc (MSG_NOTE, vect_location,
3101                                  "\t<gap of %d elements>\n",
3102                                  DR_GROUP_GAP (next) - 1);
3103               dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
3104               next = DR_GROUP_NEXT_ELEMENT (next);
3105             }
3106           if (DR_GROUP_GAP (stmt_info) != 0)
3107             dump_printf_loc (MSG_NOTE, vect_location,
3108                              "\t<gap of %d elements>\n",
3109                              DR_GROUP_GAP (stmt_info));
3110         }
3111
3112       /* SLP: create an SLP data structure for every interleaving group of
3113          stores for further analysis in vect_analyse_slp.  */
3114       if (DR_IS_WRITE (dr) && !slp_impossible)
3115         {
3116           if (loop_vinfo)
3117             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
3118           if (bb_vinfo)
3119             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3120         }
3121     }
3122
3123   return true;
3124 }
3125
3126 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
3127    accesses of legal size, step, etc.  Detect gaps, single element
3128    interleaving, and other special cases. Set grouped access info.
3129    Collect groups of strided stores for further use in SLP analysis.  */
3130
3131 static bool
3132 vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
3133 {
3134   if (!vect_analyze_group_access_1 (vinfo, dr_info))
3135     {
3136       /* Dissolve the group if present.  */
3137       stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
3138       while (stmt_info)
3139         {
3140           stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3141           DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3142           DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3143           stmt_info = next;
3144         }
3145       return false;
3146     }
3147   return true;
3148 }
3149
3150 /* Analyze the access pattern of the data-reference DR_INFO.
3151    In case of non-consecutive accesses call vect_analyze_group_access() to
3152    analyze groups of accesses.  */
3153
3154 static bool
3155 vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
3156 {
3157   data_reference *dr = dr_info->dr;
3158   tree step = DR_STEP (dr);
3159   tree scalar_type = TREE_TYPE (DR_REF (dr));
3160   stmt_vec_info stmt_info = dr_info->stmt;
3161   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3162   class loop *loop = NULL;
3163
3164   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
3165     return true;
3166
3167   if (loop_vinfo)
3168     loop = LOOP_VINFO_LOOP (loop_vinfo);
3169
3170   if (loop_vinfo && !step)
3171     {
3172       if (dump_enabled_p ())
3173         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3174                          "bad data-ref access in loop\n");
3175       return false;
3176     }
3177
3178   /* Allow loads with zero step in inner-loop vectorization.  */
3179   if (loop_vinfo && integer_zerop (step))
3180     {
3181       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3182       if (!nested_in_vect_loop_p (loop, stmt_info))
3183         return DR_IS_READ (dr);
3184       /* Allow references with zero step for outer loops marked
3185          with pragma omp simd only - it guarantees absence of
3186          loop-carried dependencies between inner loop iterations.  */
3187       if (loop->safelen < 2)
3188         {
3189           if (dump_enabled_p ())
3190             dump_printf_loc (MSG_NOTE, vect_location,
3191                              "zero step in inner loop of nest\n");
3192           return false;
3193         }
3194     }
3195
3196   if (loop && nested_in_vect_loop_p (loop, stmt_info))
3197     {
3198       /* Interleaved accesses are not yet supported within outer-loop
3199         vectorization for references in the inner-loop.  */
3200       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3201
3202       /* For the rest of the analysis we use the outer-loop step.  */
3203       step = STMT_VINFO_DR_STEP (stmt_info);
3204       if (integer_zerop (step))
3205         {
3206           if (dump_enabled_p ())
3207             dump_printf_loc (MSG_NOTE, vect_location,
3208                              "zero step in outer loop.\n");
3209           return DR_IS_READ (dr);
3210         }
3211     }
3212
3213   /* Consecutive?  */
3214   if (TREE_CODE (step) == INTEGER_CST)
3215     {
3216       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
3217       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
3218           || (dr_step < 0
3219               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
3220         {
3221           /* Mark that it is not interleaving.  */
3222           DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3223           return true;
3224         }
3225     }
3226
3227   if (loop && nested_in_vect_loop_p (loop, stmt_info))
3228     {
3229       if (dump_enabled_p ())
3230         dump_printf_loc (MSG_NOTE, vect_location,
3231                          "grouped access in outer loop.\n");
3232       return false;
3233     }
3234
3235
3236   /* Assume this is a DR handled by non-constant strided load case.  */
3237   if (TREE_CODE (step) != INTEGER_CST)
3238     return (STMT_VINFO_STRIDED_P (stmt_info)
3239             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3240                 || vect_analyze_group_access (vinfo, dr_info)));
3241
3242   /* Not consecutive access - check if it's a part of interleaving group.  */
3243   return vect_analyze_group_access (vinfo, dr_info);
3244 }
3245
3246 /* Compare two data-references DRA and DRB to group them into chunks
3247    suitable for grouping.  */
3248
3249 static int
3250 dr_group_sort_cmp (const void *dra_, const void *drb_)
3251 {
3252   dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
3253   dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
3254   data_reference_p dra = dra_info->dr;
3255   data_reference_p drb = drb_info->dr;
3256   int cmp;
3257
3258   /* Stabilize sort.  */
3259   if (dra == drb)
3260     return 0;
3261
3262   /* Different group IDs lead never belong to the same group.  */
3263   if (dra_info->group != drb_info->group)
3264     return dra_info->group < drb_info->group ? -1 : 1;
3265
3266   /* Ordering of DRs according to base.  */
3267   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3268                                DR_BASE_ADDRESS (drb));
3269   if (cmp != 0)
3270     return cmp;
3271
3272   /* And according to DR_OFFSET.  */
3273   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
3274   if (cmp != 0)
3275     return cmp;
3276
3277   /* Put reads before writes.  */
3278   if (DR_IS_READ (dra) != DR_IS_READ (drb))
3279     return DR_IS_READ (dra) ? -1 : 1;
3280
3281   /* Then sort after access size.  */
3282   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
3283                                TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
3284   if (cmp != 0)
3285     return cmp;
3286
3287   /* And after step.  */
3288   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3289   if (cmp != 0)
3290     return cmp;
3291
3292   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
3293   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3294   if (cmp == 0)
3295     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3296   return cmp;
3297 }
3298
3299 /* If OP is the result of a conversion, return the unconverted value,
3300    otherwise return null.  */
3301
3302 static tree
3303 strip_conversion (tree op)
3304 {
3305   if (TREE_CODE (op) != SSA_NAME)
3306     return NULL_TREE;
3307   gimple *stmt = SSA_NAME_DEF_STMT (op);
3308   if (!is_gimple_assign (stmt)
3309       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3310     return NULL_TREE;
3311   return gimple_assign_rhs1 (stmt);
3312 }
3313
3314 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3315    and STMT2_INFO being in a single group.  When ALLOW_SLP_P, masked loads can
3316    be grouped in SLP mode.  */
3317
3318 static bool
3319 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3320                    bool allow_slp_p)
3321 {
3322   if (gimple_assign_single_p (stmt1_info->stmt))
3323     return gimple_assign_single_p (stmt2_info->stmt);
3324
3325   gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3326   if (call1 && gimple_call_internal_p (call1))
3327     {
3328       /* Check for two masked loads or two masked stores.  */
3329       gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3330       if (!call2 || !gimple_call_internal_p (call2))
3331         return false;
3332       internal_fn ifn = gimple_call_internal_fn (call1);
3333       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3334         return false;
3335       if (ifn != gimple_call_internal_fn (call2))
3336         return false;
3337
3338       /* Check that the masks are the same.  Cope with casts of masks,
3339          like those created by build_mask_conversion.  */
3340       tree mask1 = gimple_call_arg (call1, 2);
3341       tree mask2 = gimple_call_arg (call2, 2);
3342       if (!operand_equal_p (mask1, mask2, 0) && !allow_slp_p)
3343         {
3344           mask1 = strip_conversion (mask1);
3345           if (!mask1)
3346             return false;
3347           mask2 = strip_conversion (mask2);
3348           if (!mask2)
3349             return false;
3350           if (!operand_equal_p (mask1, mask2, 0))
3351             return false;
3352         }
3353       return true;
3354     }
3355
3356   return false;
3357 }
3358
3359 /* Function vect_analyze_data_ref_accesses.
3360
3361    Analyze the access pattern of all the data references in the loop.
3362
3363    FORNOW: the only access pattern that is considered vectorizable is a
3364            simple step 1 (consecutive) access.
3365
3366    FORNOW: handle only arrays and pointer accesses.  */
3367
3368 opt_result
3369 vect_analyze_data_ref_accesses (vec_info *vinfo,
3370                                 vec<int> *dataref_groups)
3371 {
3372   unsigned int i;
3373   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3374
3375   DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3376
3377   if (datarefs.is_empty ())
3378     return opt_result::success ();
3379
3380   /* Sort the array of datarefs to make building the interleaving chains
3381      linear.  Don't modify the original vector's order, it is needed for
3382      determining what dependencies are reversed.  */
3383   vec<dr_vec_info *> datarefs_copy;
3384   datarefs_copy.create (datarefs.length ());
3385   for (unsigned i = 0; i < datarefs.length (); i++)
3386     {
3387       dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3388       /* If the caller computed DR grouping use that, otherwise group by
3389          basic blocks.  */
3390       if (dataref_groups)
3391         dr_info->group = (*dataref_groups)[i];
3392       else
3393         dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3394       datarefs_copy.quick_push (dr_info);
3395     }
3396   datarefs_copy.qsort (dr_group_sort_cmp);
3397   hash_set<stmt_vec_info> to_fixup;
3398
3399   /* Build the interleaving chains.  */
3400   for (i = 0; i < datarefs_copy.length () - 1;)
3401     {
3402       dr_vec_info *dr_info_a = datarefs_copy[i];
3403       data_reference_p dra = dr_info_a->dr;
3404       int dra_group_id = dr_info_a->group;
3405       stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3406       stmt_vec_info lastinfo = NULL;
3407       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3408           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3409         {
3410           ++i;
3411           continue;
3412         }
3413       for (i = i + 1; i < datarefs_copy.length (); ++i)
3414         {
3415           dr_vec_info *dr_info_b = datarefs_copy[i];
3416           data_reference_p drb = dr_info_b->dr;
3417           int drb_group_id = dr_info_b->group;
3418           stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3419           if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3420               || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3421             break;
3422
3423           /* ???  Imperfect sorting (non-compatible types, non-modulo
3424              accesses, same accesses) can lead to a group to be artificially
3425              split here as we don't just skip over those.  If it really
3426              matters we can push those to a worklist and re-iterate
3427              over them.  The we can just skip ahead to the next DR here.  */
3428
3429           /* DRs in a different DR group should not be put into the same
3430              interleaving group.  */
3431           if (dra_group_id != drb_group_id)
3432             break;
3433
3434           /* Check that the data-refs have same first location (except init)
3435              and they are both either store or load (not load and store,
3436              not masked loads or stores).  */
3437           if (DR_IS_READ (dra) != DR_IS_READ (drb)
3438               || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3439                                         DR_BASE_ADDRESS (drb)) != 0
3440               || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3441               || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3442             break;
3443
3444           /* Check that the data-refs have the same constant size.  */
3445           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3446           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3447           if (!tree_fits_uhwi_p (sza)
3448               || !tree_fits_uhwi_p (szb)
3449               || !tree_int_cst_equal (sza, szb))
3450             break;
3451
3452           /* Check that the data-refs have the same step.  */
3453           if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3454             break;
3455
3456           /* Check the types are compatible.
3457              ???  We don't distinguish this during sorting.  */
3458           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3459                                    TREE_TYPE (DR_REF (drb))))
3460             break;
3461
3462           /* Check that the DR_INITs are compile-time constants.  */
3463           if (!tree_fits_shwi_p (DR_INIT (dra))
3464               || !tree_fits_shwi_p (DR_INIT (drb)))
3465             break;
3466
3467           /* Different .GOMP_SIMD_LANE calls still give the same lane,
3468              just hold extra information.  */
3469           if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3470               && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3471               && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3472             break;
3473
3474           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
3475           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3476           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3477           HOST_WIDE_INT init_prev
3478             = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3479           gcc_assert (init_a <= init_b
3480                       && init_a <= init_prev
3481                       && init_prev <= init_b);
3482
3483           /* Do not place the same access in the interleaving chain twice.  */
3484           if (init_b == init_prev)
3485             {
3486               gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3487                           < gimple_uid (DR_STMT (drb)));
3488               /* Simply link in duplicates and fix up the chain below.  */
3489             }
3490           else
3491             {
3492               /* If init_b == init_a + the size of the type * k, we have an
3493                  interleaving, and DRA is accessed before DRB.  */
3494               unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3495               if (type_size_a == 0
3496                   || (((unsigned HOST_WIDE_INT)init_b - init_a)
3497                       % type_size_a != 0))
3498                 break;
3499
3500               /* If we have a store, the accesses are adjacent.  This splits
3501                  groups into chunks we support (we don't support vectorization
3502                  of stores with gaps).  */
3503               if (!DR_IS_READ (dra)
3504                   && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3505                       != type_size_a))
3506                 break;
3507
3508               /* If the step (if not zero or non-constant) is smaller than the
3509                  difference between data-refs' inits this splits groups into
3510                  suitable sizes.  */
3511               if (tree_fits_shwi_p (DR_STEP (dra)))
3512                 {
3513                   unsigned HOST_WIDE_INT step
3514                     = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3515                   if (step != 0
3516                       && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3517                     break;
3518                 }
3519             }
3520
3521           if (dump_enabled_p ())
3522             dump_printf_loc (MSG_NOTE, vect_location,
3523                              DR_IS_READ (dra)
3524                              ? "Detected interleaving load %T and %T\n"
3525                              : "Detected interleaving store %T and %T\n",
3526                              DR_REF (dra), DR_REF (drb));
3527
3528           /* Link the found element into the group list.  */
3529           if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3530             {
3531               DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3532               lastinfo = stmtinfo_a;
3533             }
3534           DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3535           DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3536           lastinfo = stmtinfo_b;
3537
3538           STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3539             = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3540
3541           if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3542             dump_printf_loc (MSG_NOTE, vect_location,
3543                              "Load suitable for SLP vectorization only.\n");
3544
3545           if (init_b == init_prev
3546               && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3547               && dump_enabled_p ())
3548             dump_printf_loc (MSG_NOTE, vect_location,
3549                              "Queuing group with duplicate access for fixup\n");
3550         }
3551     }
3552
3553   /* Fixup groups with duplicate entries by splitting it.  */
3554   while (1)
3555     {
3556       hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3557       if (!(it != to_fixup.end ()))
3558         break;
3559       stmt_vec_info grp = *it;
3560       to_fixup.remove (grp);
3561
3562       /* Find the earliest duplicate group member.  */
3563       unsigned first_duplicate = -1u;
3564       stmt_vec_info next, g = grp;
3565       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3566         {
3567           if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3568                                   DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3569               && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3570             first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3571           g = next;
3572         }
3573       if (first_duplicate == -1U)
3574         continue;
3575
3576       /* Then move all stmts after the first duplicate to a new group.
3577          Note this is a heuristic but one with the property that *it
3578          is fixed up completely.  */
3579       g = grp;
3580       stmt_vec_info newgroup = NULL, ng = grp;
3581       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3582         {
3583           if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3584             {
3585               DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3586               if (!newgroup)
3587                 newgroup = next;
3588               else
3589                 DR_GROUP_NEXT_ELEMENT (ng) = next;
3590               ng = next;
3591               DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3592             }
3593           else
3594             g = DR_GROUP_NEXT_ELEMENT (g);
3595         }
3596       DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3597
3598       /* Fixup the new group which still may contain duplicates.  */
3599       to_fixup.add (newgroup);
3600     }
3601
3602   dr_vec_info *dr_info;
3603   FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3604     {
3605       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3606           && !vect_analyze_data_ref_access (vinfo, dr_info))
3607         {
3608           if (dump_enabled_p ())
3609             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3610                              "not vectorized: complicated access pattern.\n");
3611
3612           if (is_a <bb_vec_info> (vinfo))
3613             {
3614               /* Mark the statement as not vectorizable.  */
3615               STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3616               continue;
3617             }
3618           else
3619             {
3620               datarefs_copy.release ();
3621               return opt_result::failure_at (dr_info->stmt->stmt,
3622                                              "not vectorized:"
3623                                              " complicated access pattern.\n");
3624             }
3625         }
3626     }
3627
3628   datarefs_copy.release ();
3629   return opt_result::success ();
3630 }
3631
3632 /* Function vect_vfa_segment_size.
3633
3634    Input:
3635      DR_INFO: The data reference.
3636      LENGTH_FACTOR: segment length to consider.
3637
3638    Return a value suitable for the dr_with_seg_len::seg_len field.
3639    This is the "distance travelled" by the pointer from the first
3640    iteration in the segment to the last.  Note that it does not include
3641    the size of the access; in effect it only describes the first byte.  */
3642
3643 static tree
3644 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3645 {
3646   length_factor = size_binop (MINUS_EXPR,
3647                               fold_convert (sizetype, length_factor),
3648                               size_one_node);
3649   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3650                      length_factor);
3651 }
3652
3653 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3654    gives the worst-case number of bytes covered by the segment.  */
3655
3656 static unsigned HOST_WIDE_INT
3657 vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3658 {
3659   stmt_vec_info stmt_vinfo = dr_info->stmt;
3660   tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3661   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3662   unsigned HOST_WIDE_INT access_size = ref_size;
3663   if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3664     {
3665       gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3666       access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3667     }
3668   tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3669   int misalignment;
3670   if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3671       && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3672       && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3673           == dr_explicit_realign_optimized))
3674     {
3675       /* We might access a full vector's worth.  */
3676       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3677     }
3678   return access_size;
3679 }
3680
3681 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3682    describes.  */
3683
3684 static unsigned int
3685 vect_vfa_align (dr_vec_info *dr_info)
3686 {
3687   return dr_alignment (dr_info->dr);
3688 }
3689
3690 /* Function vect_no_alias_p.
3691
3692    Given data references A and B with equal base and offset, see whether
3693    the alias relation can be decided at compilation time.  Return 1 if
3694    it can and the references alias, 0 if it can and the references do
3695    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3696    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3697    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
3698
3699 static int
3700 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3701                          tree segment_length_a, tree segment_length_b,
3702                          unsigned HOST_WIDE_INT access_size_a,
3703                          unsigned HOST_WIDE_INT access_size_b)
3704 {
3705   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3706   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3707   poly_uint64 const_length_a;
3708   poly_uint64 const_length_b;
3709
3710   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3711      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3712      [a, a+12) */
3713   if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3714     {
3715       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3716       offset_a -= const_length_a;
3717     }
3718   else
3719     const_length_a = tree_to_poly_uint64 (segment_length_a);
3720   if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3721     {
3722       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3723       offset_b -= const_length_b;
3724     }
3725   else
3726     const_length_b = tree_to_poly_uint64 (segment_length_b);
3727
3728   const_length_a += access_size_a;
3729   const_length_b += access_size_b;
3730
3731   if (ranges_known_overlap_p (offset_a, const_length_a,
3732                               offset_b, const_length_b))
3733     return 1;
3734
3735   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3736                                offset_b, const_length_b))
3737     return 0;
3738
3739   return -1;
3740 }
3741
3742 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3743    in DDR is >= VF.  */
3744
3745 static bool
3746 dependence_distance_ge_vf (data_dependence_relation *ddr,
3747                            unsigned int loop_depth, poly_uint64 vf)
3748 {
3749   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3750       || DDR_NUM_DIST_VECTS (ddr) == 0)
3751     return false;
3752
3753   /* If the dependence is exact, we should have limited the VF instead.  */
3754   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3755
3756   unsigned int i;
3757   lambda_vector dist_v;
3758   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3759     {
3760       HOST_WIDE_INT dist = dist_v[loop_depth];
3761       if (dist != 0
3762           && !(dist > 0 && DDR_REVERSED_P (ddr))
3763           && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3764         return false;
3765     }
3766
3767   if (dump_enabled_p ())
3768     dump_printf_loc (MSG_NOTE, vect_location,
3769                      "dependence distance between %T and %T is >= VF\n",
3770                      DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3771
3772   return true;
3773 }
3774
3775 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3776
3777 static void
3778 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3779 {
3780   dump_printf (dump_kind, "%s (%T) >= ",
3781                lower_bound.unsigned_p ? "unsigned" : "abs",
3782                lower_bound.expr);
3783   dump_dec (dump_kind, lower_bound.min_value);
3784 }
3785
3786 /* Record that the vectorized loop requires the vec_lower_bound described
3787    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3788
3789 static void
3790 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3791                         poly_uint64 min_value)
3792 {
3793   vec<vec_lower_bound> &lower_bounds
3794     = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3795   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3796     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3797       {
3798         unsigned_p &= lower_bounds[i].unsigned_p;
3799         min_value = upper_bound (lower_bounds[i].min_value, min_value);
3800         if (lower_bounds[i].unsigned_p != unsigned_p
3801             || maybe_lt (lower_bounds[i].min_value, min_value))
3802           {
3803             lower_bounds[i].unsigned_p = unsigned_p;
3804             lower_bounds[i].min_value = min_value;
3805             if (dump_enabled_p ())
3806               {
3807                 dump_printf_loc (MSG_NOTE, vect_location,
3808                                  "updating run-time check to ");
3809                 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3810                 dump_printf (MSG_NOTE, "\n");
3811               }
3812           }
3813         return;
3814       }
3815
3816   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3817   if (dump_enabled_p ())
3818     {
3819       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3820       dump_lower_bound (MSG_NOTE, lower_bound);
3821       dump_printf (MSG_NOTE, "\n");
3822     }
3823   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3824 }
3825
3826 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3827    will span fewer than GAP bytes.  */
3828
3829 static bool
3830 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3831                   poly_int64 gap)
3832 {
3833   stmt_vec_info stmt_info = dr_info->stmt;
3834   HOST_WIDE_INT count
3835     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3836   if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3837     count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3838   return (estimated_poly_value (gap)
3839           <= count * vect_get_scalar_dr_size (dr_info));
3840 }
3841
3842 /* Return true if we know that there is no alias between DR_INFO_A and
3843    DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3844    When returning true, set *LOWER_BOUND_OUT to this N.  */
3845
3846 static bool
3847 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3848                                 poly_uint64 *lower_bound_out)
3849 {
3850   /* Check that there is a constant gap of known sign between DR_A
3851      and DR_B.  */
3852   data_reference *dr_a = dr_info_a->dr;
3853   data_reference *dr_b = dr_info_b->dr;
3854   poly_int64 init_a, init_b;
3855   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3856       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3857       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3858       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3859       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3860       || !ordered_p (init_a, init_b))
3861     return false;
3862
3863   /* Sort DR_A and DR_B by the address they access.  */
3864   if (maybe_lt (init_b, init_a))
3865     {
3866       std::swap (init_a, init_b);
3867       std::swap (dr_info_a, dr_info_b);
3868       std::swap (dr_a, dr_b);
3869     }
3870
3871   /* If the two accesses could be dependent within a scalar iteration,
3872      make sure that we'd retain their order.  */
3873   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3874       && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3875     return false;
3876
3877   /* There is no alias if abs (DR_STEP) is greater than or equal to
3878      the bytes spanned by the combination of the two accesses.  */
3879   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3880   return true;
3881 }
3882
3883 /* Function vect_prune_runtime_alias_test_list.
3884
3885    Prune a list of ddrs to be tested at run-time by versioning for alias.
3886    Merge several alias checks into one if possible.
3887    Return FALSE if resulting list of ddrs is longer then allowed by
3888    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3889
3890 opt_result
3891 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3892 {
3893   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3894   hash_set <tree_pair_hash> compared_objects;
3895
3896   const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3897   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3898     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3899   const vec<vec_object_pair> &check_unequal_addrs
3900     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3901   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3902   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3903
3904   ddr_p ddr;
3905   unsigned int i;
3906   tree length_factor;
3907
3908   DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3909
3910   /* Step values are irrelevant for aliasing if the number of vector
3911      iterations is equal to the number of scalar iterations (which can
3912      happen for fully-SLP loops).  */
3913   bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3914
3915   if (!vf_one_p)
3916     {
3917       /* Convert the checks for nonzero steps into bound tests.  */
3918       tree value;
3919       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3920         vect_check_lower_bound (loop_vinfo, value, true, 1);
3921     }
3922
3923   if (may_alias_ddrs.is_empty ())
3924     return opt_result::success ();
3925
3926   comp_alias_ddrs.create (may_alias_ddrs.length ());
3927
3928   unsigned int loop_depth
3929     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3930                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
3931
3932   /* First, we collect all data ref pairs for aliasing checks.  */
3933   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3934     {
3935       poly_uint64 lower_bound;
3936       tree segment_length_a, segment_length_b;
3937       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3938       unsigned int align_a, align_b;
3939
3940       /* Ignore the alias if the VF we chose ended up being no greater
3941          than the dependence distance.  */
3942       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3943         continue;
3944
3945       if (DDR_OBJECT_A (ddr))
3946         {
3947           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3948           if (!compared_objects.add (new_pair))
3949             {
3950               if (dump_enabled_p ())
3951                 dump_printf_loc (MSG_NOTE, vect_location,
3952                                  "checking that %T and %T"
3953                                  " have different addresses\n",
3954                                  new_pair.first, new_pair.second);
3955               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3956             }
3957           continue;
3958         }
3959
3960       dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3961       stmt_vec_info stmt_info_a = dr_info_a->stmt;
3962
3963       dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3964       stmt_vec_info stmt_info_b = dr_info_b->stmt;
3965
3966       bool preserves_scalar_order_p
3967         = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3968       bool ignore_step_p
3969           = (vf_one_p
3970              && (preserves_scalar_order_p
3971                  || operand_equal_p (DR_STEP (dr_info_a->dr),
3972                                      DR_STEP (dr_info_b->dr))));
3973
3974       /* Skip the pair if inter-iteration dependencies are irrelevant
3975          and intra-iteration dependencies are guaranteed to be honored.  */
3976       if (ignore_step_p
3977           && (preserves_scalar_order_p
3978               || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3979                                                  &lower_bound)))
3980         {
3981           if (dump_enabled_p ())
3982             dump_printf_loc (MSG_NOTE, vect_location,
3983                              "no need for alias check between "
3984                              "%T and %T when VF is 1\n",
3985                              DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3986           continue;
3987         }
3988
3989       /* See whether we can handle the alias using a bounds check on
3990          the step, and whether that's likely to be the best approach.
3991          (It might not be, for example, if the minimum step is much larger
3992          than the number of bytes handled by one vector iteration.)  */
3993       if (!ignore_step_p
3994           && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3995           && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3996                                              &lower_bound)
3997           && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3998               || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3999         {
4000           bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
4001           if (dump_enabled_p ())
4002             {
4003               dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
4004                                "%T and %T when the step %T is outside ",
4005                                DR_REF (dr_info_a->dr),
4006                                DR_REF (dr_info_b->dr),
4007                                DR_STEP (dr_info_a->dr));
4008               if (unsigned_p)
4009                 dump_printf (MSG_NOTE, "[0");
4010               else
4011                 {
4012                   dump_printf (MSG_NOTE, "(");
4013                   dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
4014                 }
4015               dump_printf (MSG_NOTE, ", ");
4016               dump_dec (MSG_NOTE, lower_bound);
4017               dump_printf (MSG_NOTE, ")\n");
4018             }
4019           vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
4020                                   unsigned_p, lower_bound);
4021           continue;
4022         }
4023
4024       stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
4025       if (dr_group_first_a)
4026         {
4027           stmt_info_a = dr_group_first_a;
4028           dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
4029         }
4030
4031       stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
4032       if (dr_group_first_b)
4033         {
4034           stmt_info_b = dr_group_first_b;
4035           dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
4036         }
4037
4038       if (ignore_step_p)
4039         {
4040           segment_length_a = size_zero_node;
4041           segment_length_b = size_zero_node;
4042         }
4043       else
4044         {
4045           if (!operand_equal_p (DR_STEP (dr_info_a->dr),
4046                                 DR_STEP (dr_info_b->dr), 0))
4047             length_factor = scalar_loop_iters;
4048           else
4049             length_factor = size_int (vect_factor);
4050           segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
4051           segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
4052         }
4053       access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
4054       access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
4055       align_a = vect_vfa_align (dr_info_a);
4056       align_b = vect_vfa_align (dr_info_b);
4057
4058       /* See whether the alias is known at compilation time.  */
4059       if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
4060                            DR_BASE_ADDRESS (dr_info_b->dr), 0)
4061           && operand_equal_p (DR_OFFSET (dr_info_a->dr),
4062                               DR_OFFSET (dr_info_b->dr), 0)
4063           && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
4064           && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
4065           && poly_int_tree_p (segment_length_a)
4066           && poly_int_tree_p (segment_length_b))
4067         {
4068           int res = vect_compile_time_alias (dr_info_a, dr_info_b,
4069                                              segment_length_a,
4070                                              segment_length_b,
4071                                              access_size_a,
4072                                              access_size_b);
4073           if (res >= 0 && dump_enabled_p ())
4074             {
4075               dump_printf_loc (MSG_NOTE, vect_location,
4076                                "can tell at compile time that %T and %T",
4077                                DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
4078               if (res == 0)
4079                 dump_printf (MSG_NOTE, " do not alias\n");
4080               else
4081                 dump_printf (MSG_NOTE, " alias\n");
4082             }
4083
4084           if (res == 0)
4085             continue;
4086
4087           if (res == 1)
4088             return opt_result::failure_at (stmt_info_b->stmt,
4089                                            "not vectorized:"
4090                                            " compilation time alias: %G%G",
4091                                            stmt_info_a->stmt,
4092                                            stmt_info_b->stmt);
4093         }
4094
4095       dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
4096                             access_size_a, align_a);
4097       dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
4098                             access_size_b, align_b);
4099       /* Canonicalize the order to be the one that's needed for accurate
4100          RAW, WAR and WAW flags, in cases where the data references are
4101          well-ordered.  The order doesn't really matter otherwise,
4102          but we might as well be consistent.  */
4103       if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
4104         std::swap (dr_a, dr_b);
4105
4106       dr_with_seg_len_pair_t dr_with_seg_len_pair
4107         (dr_a, dr_b, (preserves_scalar_order_p
4108                       ? dr_with_seg_len_pair_t::WELL_ORDERED
4109                       : dr_with_seg_len_pair_t::REORDERED));
4110
4111       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
4112     }
4113
4114   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
4115
4116   unsigned int count = (comp_alias_ddrs.length ()
4117                         + check_unequal_addrs.length ());
4118
4119   if (count
4120       && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
4121           == VECT_COST_MODEL_VERY_CHEAP))
4122     return opt_result::failure_at
4123       (vect_location, "would need a runtime alias check\n");
4124
4125   if (dump_enabled_p ())
4126     dump_printf_loc (MSG_NOTE, vect_location,
4127                      "improved number of alias checks from %d to %d\n",
4128                      may_alias_ddrs.length (), count);
4129   unsigned limit = param_vect_max_version_for_alias_checks;
4130   if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
4131     limit = param_vect_max_version_for_alias_checks * 6 / 10;
4132   if (count > limit)
4133     return opt_result::failure_at
4134       (vect_location,
4135        "number of versioning for alias run-time tests exceeds %d "
4136        "(--param vect-max-version-for-alias-checks)\n", limit);
4137
4138   return opt_result::success ();
4139 }
4140
4141 /* Check whether we can use an internal function for a gather load
4142    or scatter store.  READ_P is true for loads and false for stores.
4143    MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
4144    the type of the memory elements being loaded or stored.  OFFSET_TYPE
4145    is the type of the offset that is being applied to the invariant
4146    base address.  SCALE is the amount by which the offset should
4147    be multiplied *after* it has been converted to address width.
4148
4149    Return true if the function is supported, storing the function id in
4150    *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT.  */
4151
4152 bool
4153 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
4154                           tree vectype, tree memory_type, tree offset_type,
4155                           int scale, internal_fn *ifn_out,
4156                           tree *offset_vectype_out)
4157 {
4158   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
4159   unsigned int element_bits = vector_element_bits (vectype);
4160   if (element_bits != memory_bits)
4161     /* For now the vector elements must be the same width as the
4162        memory elements.  */
4163     return false;
4164
4165   /* Work out which function we need.  */
4166   internal_fn ifn, alt_ifn, alt_ifn2;
4167   if (read_p)
4168     {
4169       ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
4170       alt_ifn = IFN_MASK_GATHER_LOAD;
4171       /* When target supports MASK_LEN_GATHER_LOAD, we always
4172          use MASK_LEN_GATHER_LOAD regardless whether len and
4173          mask are valid or not.  */
4174       alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
4175     }
4176   else
4177     {
4178       ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
4179       alt_ifn = IFN_MASK_SCATTER_STORE;
4180       /* When target supports MASK_LEN_SCATTER_STORE, we always
4181          use MASK_LEN_SCATTER_STORE regardless whether len and
4182          mask are valid or not.  */
4183       alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
4184     }
4185
4186   for (;;)
4187     {
4188       tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
4189       if (!offset_vectype)
4190         return false;
4191
4192       /* Test whether the target supports this combination.  */
4193       if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
4194                                                   offset_vectype, scale))
4195         {
4196           *ifn_out = ifn;
4197           *offset_vectype_out = offset_vectype;
4198           return true;
4199         }
4200       else if (!masked_p
4201                && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
4202                                                           memory_type,
4203                                                           offset_vectype,
4204                                                           scale))
4205         {
4206           *ifn_out = alt_ifn;
4207           *offset_vectype_out = offset_vectype;
4208           return true;
4209         }
4210       else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
4211                                                        memory_type,
4212                                                        offset_vectype, scale))
4213         {
4214           *ifn_out = alt_ifn2;
4215           *offset_vectype_out = offset_vectype;
4216           return true;
4217         }
4218
4219       if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
4220           && TYPE_PRECISION (offset_type) >= element_bits)
4221         return false;
4222
4223       offset_type = build_nonstandard_integer_type
4224         (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
4225     }
4226 }
4227
4228 /* STMT_INFO is a call to an internal gather load or scatter store function.
4229    Describe the operation in INFO.  */
4230
4231 static void
4232 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
4233                                    gather_scatter_info *info)
4234 {
4235   gcall *call = as_a <gcall *> (stmt_info->stmt);
4236   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4237   data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4238
4239   info->ifn = gimple_call_internal_fn (call);
4240   info->decl = NULL_TREE;
4241   info->base = gimple_call_arg (call, 0);
4242   info->offset = gimple_call_arg (call, 1);
4243   info->offset_dt = vect_unknown_def_type;
4244   info->offset_vectype = NULL_TREE;
4245   info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
4246   info->element_type = TREE_TYPE (vectype);
4247   info->memory_type = TREE_TYPE (DR_REF (dr));
4248 }
4249
4250 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
4251    gather load or scatter store.  Describe the operation in *INFO if so.  */
4252
4253 bool
4254 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
4255                            gather_scatter_info *info)
4256 {
4257   HOST_WIDE_INT scale = 1;
4258   poly_int64 pbitpos, pbitsize;
4259   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4260   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4261   tree offtype = NULL_TREE;
4262   tree decl = NULL_TREE, base, off;
4263   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4264   tree memory_type = TREE_TYPE (DR_REF (dr));
4265   machine_mode pmode;
4266   int punsignedp, reversep, pvolatilep = 0;
4267   internal_fn ifn;
4268   tree offset_vectype;
4269   bool masked_p = false;
4270
4271   /* See whether this is already a call to a gather/scatter internal function.
4272      If not, see whether it's a masked load or store.  */
4273   gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
4274   if (call && gimple_call_internal_p (call))
4275     {
4276       ifn = gimple_call_internal_fn (call);
4277       if (internal_gather_scatter_fn_p (ifn))
4278         {
4279           vect_describe_gather_scatter_call (stmt_info, info);
4280           return true;
4281         }
4282       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
4283     }
4284
4285   /* True if we should aim to use internal functions rather than
4286      built-in functions.  */
4287   bool use_ifn_p = (DR_IS_READ (dr)
4288                     ? supports_vec_gather_load_p (TYPE_MODE (vectype))
4289                     : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
4290
4291   base = DR_REF (dr);
4292   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
4293      see if we can use the def stmt of the address.  */
4294   if (masked_p
4295       && TREE_CODE (base) == MEM_REF
4296       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4297       && integer_zerop (TREE_OPERAND (base, 1))
4298       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4299     {
4300       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4301       if (is_gimple_assign (def_stmt)
4302           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4303         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4304     }
4305
4306   /* The gather and scatter builtins need address of the form
4307      loop_invariant + vector * {1, 2, 4, 8}
4308      or
4309      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4310      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4311      of loop invariants/SSA_NAMEs defined in the loop, with casts,
4312      multiplications and additions in it.  To get a vector, we need
4313      a single SSA_NAME that will be defined in the loop and will
4314      contain everything that is not loop invariant and that can be
4315      vectorized.  The following code attempts to find such a preexistng
4316      SSA_NAME OFF and put the loop invariants into a tree BASE
4317      that can be gimplified before the loop.  */
4318   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4319                               &punsignedp, &reversep, &pvolatilep);
4320   if (reversep)
4321     return false;
4322
4323   /* PR 107346.  Packed structs can have fields at offsets that are not
4324      multiples of BITS_PER_UNIT.  Do not use gather/scatters in such cases.  */
4325   if (!multiple_p (pbitpos, BITS_PER_UNIT))
4326     return false;
4327
4328   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4329
4330   if (TREE_CODE (base) == MEM_REF)
4331     {
4332       if (!integer_zerop (TREE_OPERAND (base, 1)))
4333         {
4334           if (off == NULL_TREE)
4335             off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4336           else
4337             off = size_binop (PLUS_EXPR, off,
4338                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
4339         }
4340       base = TREE_OPERAND (base, 0);
4341     }
4342   else
4343     base = build_fold_addr_expr (base);
4344
4345   if (off == NULL_TREE)
4346     off = size_zero_node;
4347
4348   /* If base is not loop invariant, either off is 0, then we start with just
4349      the constant offset in the loop invariant BASE and continue with base
4350      as OFF, otherwise give up.
4351      We could handle that case by gimplifying the addition of base + off
4352      into some SSA_NAME and use that as off, but for now punt.  */
4353   if (!expr_invariant_in_loop_p (loop, base))
4354     {
4355       if (!integer_zerop (off))
4356         return false;
4357       off = base;
4358       base = size_int (pbytepos);
4359     }
4360   /* Otherwise put base + constant offset into the loop invariant BASE
4361      and continue with OFF.  */
4362   else
4363     {
4364       base = fold_convert (sizetype, base);
4365       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4366     }
4367
4368   /* OFF at this point may be either a SSA_NAME or some tree expression
4369      from get_inner_reference.  Try to peel off loop invariants from it
4370      into BASE as long as possible.  */
4371   STRIP_NOPS (off);
4372   while (offtype == NULL_TREE)
4373     {
4374       enum tree_code code;
4375       tree op0, op1, add = NULL_TREE;
4376
4377       if (TREE_CODE (off) == SSA_NAME)
4378         {
4379           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4380
4381           if (expr_invariant_in_loop_p (loop, off))
4382             return false;
4383
4384           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4385             break;
4386
4387           op0 = gimple_assign_rhs1 (def_stmt);
4388           code = gimple_assign_rhs_code (def_stmt);
4389           op1 = gimple_assign_rhs2 (def_stmt);
4390         }
4391       else
4392         {
4393           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4394             return false;
4395           code = TREE_CODE (off);
4396           extract_ops_from_tree (off, &code, &op0, &op1);
4397         }
4398       switch (code)
4399         {
4400         case POINTER_PLUS_EXPR:
4401         case PLUS_EXPR:
4402           if (expr_invariant_in_loop_p (loop, op0))
4403             {
4404               add = op0;
4405               off = op1;
4406             do_add:
4407               add = fold_convert (sizetype, add);
4408               if (scale != 1)
4409                 add = size_binop (MULT_EXPR, add, size_int (scale));
4410               base = size_binop (PLUS_EXPR, base, add);
4411               continue;
4412             }
4413           if (expr_invariant_in_loop_p (loop, op1))
4414             {
4415               add = op1;
4416               off = op0;
4417               goto do_add;
4418             }
4419           break;
4420         case MINUS_EXPR:
4421           if (expr_invariant_in_loop_p (loop, op1))
4422             {
4423               add = fold_convert (sizetype, op1);
4424               add = size_binop (MINUS_EXPR, size_zero_node, add);
4425               off = op0;
4426               goto do_add;
4427             }
4428           break;
4429         case MULT_EXPR:
4430           if (scale == 1 && tree_fits_shwi_p (op1))
4431             {
4432               int new_scale = tree_to_shwi (op1);
4433               /* Only treat this as a scaling operation if the target
4434                  supports it for at least some offset type.  */
4435               if (use_ifn_p
4436                   && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4437                                                 masked_p, vectype, memory_type,
4438                                                 signed_char_type_node,
4439                                                 new_scale, &ifn,
4440                                                 &offset_vectype)
4441                   && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4442                                                 masked_p, vectype, memory_type,
4443                                                 unsigned_char_type_node,
4444                                                 new_scale, &ifn,
4445                                                 &offset_vectype))
4446                 break;
4447               scale = new_scale;
4448               off = op0;
4449               continue;
4450             }
4451           break;
4452         case SSA_NAME:
4453           off = op0;
4454           continue;
4455         CASE_CONVERT:
4456           if (!POINTER_TYPE_P (TREE_TYPE (op0))
4457               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4458             break;
4459
4460           /* Don't include the conversion if the target is happy with
4461              the current offset type.  */
4462           if (use_ifn_p
4463               && TREE_CODE (off) == SSA_NAME
4464               && !POINTER_TYPE_P (TREE_TYPE (off))
4465               && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4466                                            masked_p, vectype, memory_type,
4467                                            TREE_TYPE (off), scale, &ifn,
4468                                            &offset_vectype))
4469             break;
4470
4471           if (TYPE_PRECISION (TREE_TYPE (op0))
4472               == TYPE_PRECISION (TREE_TYPE (off)))
4473             {
4474               off = op0;
4475               continue;
4476             }
4477
4478           /* Include the conversion if it is widening and we're using
4479              the IFN path or the target can handle the converted from
4480              offset or the current size is not already the same as the
4481              data vector element size.  */
4482           if ((TYPE_PRECISION (TREE_TYPE (op0))
4483                < TYPE_PRECISION (TREE_TYPE (off)))
4484               && (use_ifn_p
4485                   || (DR_IS_READ (dr)
4486                       ? (targetm.vectorize.builtin_gather
4487                          && targetm.vectorize.builtin_gather (vectype,
4488                                                               TREE_TYPE (op0),
4489                                                               scale))
4490                       : (targetm.vectorize.builtin_scatter
4491                          && targetm.vectorize.builtin_scatter (vectype,
4492                                                                TREE_TYPE (op0),
4493                                                                scale)))
4494                   || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4495                                        TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4496             {
4497               off = op0;
4498               offtype = TREE_TYPE (off);
4499               STRIP_NOPS (off);
4500               continue;
4501             }
4502           break;
4503         default:
4504           break;
4505         }
4506       break;
4507     }
4508
4509   /* If at the end OFF still isn't a SSA_NAME or isn't
4510      defined in the loop, punt.  */
4511   if (TREE_CODE (off) != SSA_NAME
4512       || expr_invariant_in_loop_p (loop, off))
4513     return false;
4514
4515   if (offtype == NULL_TREE)
4516     offtype = TREE_TYPE (off);
4517
4518   if (use_ifn_p)
4519     {
4520       if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4521                                      vectype, memory_type, offtype, scale,
4522                                      &ifn, &offset_vectype))
4523         ifn = IFN_LAST;
4524       decl = NULL_TREE;
4525     }
4526   else
4527     {
4528       if (DR_IS_READ (dr))
4529         {
4530           if (targetm.vectorize.builtin_gather)
4531             decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4532         }
4533       else
4534         {
4535           if (targetm.vectorize.builtin_scatter)
4536             decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4537         }
4538       ifn = IFN_LAST;
4539       /* The offset vector type will be read from DECL when needed.  */
4540       offset_vectype = NULL_TREE;
4541     }
4542
4543   info->ifn = ifn;
4544   info->decl = decl;
4545   info->base = base;
4546   info->offset = off;
4547   info->offset_dt = vect_unknown_def_type;
4548   info->offset_vectype = offset_vectype;
4549   info->scale = scale;
4550   info->element_type = TREE_TYPE (vectype);
4551   info->memory_type = memory_type;
4552   return true;
4553 }
4554
4555 /* Find the data reference in STMT, analyze it with respect to LOOP and
4556    append it to DATAREFS.  Return an opt_result failure if the data
4557    references in this stmt cannot be handled.

        If DATAREF_GROUPS is non-null, GROUP_ID is pushed onto it each time a
        data reference is pushed onto DATAREFS.

        Success is returned without pushing anything for clobbers and for
        statements in which no data reference is found.  Failure is returned
        when find_data_references_in_stmt fails, for statements with volatile
        operands, statements that can throw internally, statements with more
        than one data reference, calls other than the internal functions
        IFN_MASK_LOAD and IFN_MASK_STORE, bit-field COMPONENT_REFs and
        references whose base address is an INTEGER_CST.

        If LOOP has a simduid and the analysis left any of base address,
        offset, init or step unset, a second data reference is created for the
        same memory reference to see whether it is indexed by the result of
        IFN_GOMP_SIMD_LANE; if so that second reference is adjusted, tagged
        through its AUX field and pushed instead of the first one.  */
4558
4559 opt_result
4560 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4561                                vec<data_reference_p> *datarefs,
4562                                vec<int> *dataref_groups, int group_id)
4563 {
4564   /* We can ignore clobbers for dataref analysis - they are removed during
4565      loop vectorization and BB vectorization checks dependences with a
4566      stmt walk.  */
4567   if (gimple_clobber_p (stmt))
4568     return opt_result::success ();
4569
4570   if (gimple_has_volatile_ops (stmt))
4571     return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4572                                    stmt);
4573
4574   if (stmt_can_throw_internal (cfun, stmt))
4575     return opt_result::failure_at (stmt,
4576                                    "not vectorized:"
4577                                    " statement can throw an exception: %G",
4578                                    stmt);
4579
       /* Collect the data references of STMT in a local vector first, so that
          nothing reaches DATAREFS unless the statement is accepted.  */
4580   auto_vec<data_reference_p, 2> refs;
4581   opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4582   if (!res)
4583     return res;
4584
4585   if (refs.is_empty ())
4586     return opt_result::success ();
4587
       /* Only one data reference per statement is handled; release everything
          that was collected before failing.  */
4588   if (refs.length () > 1)
4589     {
4590       while (!refs.is_empty ())
4591         free_data_ref (refs.pop ());
4592       return opt_result::failure_at (stmt,
4593                                      "not vectorized: more than one "
4594                                      "data ref in stmt: %G", stmt);
4595     }
4596
       /* From here on DR is the single reference; every failure path below
          frees it before returning.  Among calls, only the internal functions
          IFN_MASK_LOAD and IFN_MASK_STORE are accepted.  */
4597   data_reference_p dr = refs.pop ();
4598   if (gcall *call = dyn_cast <gcall *> (stmt))
4599     if (!gimple_call_internal_p (call)
4600         || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4601             && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4602       {
4603         free_data_ref (dr);
4604         return opt_result::failure_at (stmt,
4605                                        "not vectorized: dr in a call %G", stmt);
4606       }
4607
       /* A COMPONENT_REF whose field is a bit-field is rejected.  */
4608   if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4609       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4610     {
4611       free_data_ref (dr);
4612       return opt_result::failure_at (stmt,
4613                                      "not vectorized:"
4614                                      " statement is an unsupported"
4615                                      " bitfield access %G", stmt);
4616     }
4617
       /* A base address that is an INTEGER_CST is rejected.  */
4618   if (DR_BASE_ADDRESS (dr)
4619       && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4620     {
4621       free_data_ref (dr);
4622       return opt_result::failure_at (stmt,
4623                                      "not vectorized:"
4624                                      " base addr of dr is a constant\n");
4625     }
4626
4627   /* Check whether this may be a SIMD lane access and adjust the
4628      DR to make it easier for us to handle it.  This is only tried when
          LOOP has a simduid and the analysis of DR is incomplete.  */
4629   if (loop
4630       && loop->simduid
4631       && (!DR_BASE_ADDRESS (dr)
4632           || !DR_OFFSET (dr)
4633           || !DR_INIT (dr)
4634           || !DR_STEP (dr)))
4635     {
           /* Create a second data reference for the same memory reference,
              this time passing the loop that contains STMT.  */
4636       struct data_reference *newdr
4637         = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4638                            DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
           /* Continue only if that analysis is complete, INIT is an
              INTEGER_CST and STEP is zero.  */
4639       if (DR_BASE_ADDRESS (newdr)
4640           && DR_OFFSET (newdr)
4641           && DR_INIT (newdr)
4642           && DR_STEP (newdr)
4643           && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4644           && integer_zerop (DR_STEP (newdr)))
4645         {
4646           tree base_address = DR_BASE_ADDRESS (newdr);
4647           tree off = DR_OFFSET (newdr);
4648           tree step = ssize_int (1);
               /* With a zero offset, split a POINTER_PLUS_EXPR base address
                  into its pointer and offset operands instead.  */
4649           if (integer_zerop (off)
4650               && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4651             {
4652               off = TREE_OPERAND (base_address, 1);
4653               base_address = TREE_OPERAND (base_address, 0);
4654             }
4655           STRIP_NOPS (off);
               /* Split OFF * CST into the index OFF and the candidate step
                  CST.  */
4656           if (TREE_CODE (off) == MULT_EXPR
4657               && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4658             {
4659               step = TREE_OPERAND (off, 1);
4660               off = TREE_OPERAND (off, 0);
4661               STRIP_NOPS (off);
4662             }
               /* Strip a widening conversion left on the index expression.  */
4663           if (CONVERT_EXPR_P (off)
4664               && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4665                   < TYPE_PRECISION (TREE_TYPE (off))))
4666             off = TREE_OPERAND (off, 0);
4667           if (TREE_CODE (off) == SSA_NAME)
4668             {
4669               gimple *def = SSA_NAME_DEF_STMT (off);
4670               /* Look through widening conversion.  */
4671               if (is_gimple_assign (def)
4672                   && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4673                 {
4674                   tree rhs1 = gimple_assign_rhs1 (def);
4675                   if (TREE_CODE (rhs1) == SSA_NAME
4676                       && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4677                       && (TYPE_PRECISION (TREE_TYPE (off))
4678                           > TYPE_PRECISION (TREE_TYPE (rhs1))))
4679                     def = SSA_NAME_DEF_STMT (rhs1);
4680                 }
                   /* The index has to be defined by a call to the internal
                      function IFN_GOMP_SIMD_LANE.  */
4681               if (is_gimple_call (def)
4682                   && gimple_call_internal_p (def)
4683                   && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4684                 {
4685                   tree arg = gimple_call_arg (def, 0);
4686                   tree reft = TREE_TYPE (DR_REF (newdr));
4687                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
4688                   arg = SSA_NAME_VAR (arg);
                       /* The variable underlying the first call argument
                          must be LOOP's simduid, and the step must equal the
                          size in units of the referenced type.  */
4689                   if (arg == loop->simduid
4690                       /* For now.  */
4691                       && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4692                     {
4693                       DR_BASE_ADDRESS (newdr) = base_address;
4694                       DR_OFFSET (newdr) = ssize_int (0);
4695                       DR_STEP (newdr) = step;
4696                       DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4697                       DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4698                       /* Mark as simd-lane access.  AUX is set to
                              -1 - ARG2, where ARG2 is the second argument of
                              the IFN_GOMP_SIMD_LANE call.  */
4699                       tree arg2 = gimple_call_arg (def, 1);
4700                       newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
                           /* NEWDR replaces DR, which is released here.  */
4701                       free_data_ref (dr);
4702                       datarefs->safe_push (newdr);
4703                       if (dataref_groups)
4704                         dataref_groups->safe_push (group_id);
4705                       return opt_result::success ();
4706                     }
4707                 }
4708             }
4709         }
           /* No SIMD lane access was recognized; drop the second data
              reference and fall back to DR.  */
4710       free_data_ref (newdr);
4711     }
4712
4713   datarefs->safe_push (dr);
4714   if (dataref_groups)
4715     dataref_groups->safe_push (group_id);
4716   return opt_result::success ();
4717 }
4718
4719 /* Function vect_analyze_data_refs.
4720
4721    Process the data references of the loop or basic block described by
        VINFO.  The references are read from VINFO->shared->datarefs; this
        function does not collect them itself.

        For each data reference the function:
        - records it in the stmt_vec_info of its statement;
        - if its base address, offset, init or step is missing, treats it as a
          gather load or scatter store candidate when VINFO is a loop_vec_info
          and the statement is not nested in an inner loop; otherwise it
          fails (loop) or marks the statement as not vectorizable and goes on
          with the next reference (basic block);
        - translates a SIMD lane marking found in the AUX field of the
          reference into STMT_VINFO_SIMD_LANE_ACCESS_P;
        - rejects references whose base is a DECL_NONALIASED variable;
        - for loops, sets STMT_VINFO_STRIDED_P when the step is not an
          INTEGER_CST (and fails if such a statement is in an inner loop);
        - for statements nested in an inner loop, analyzes the first location
          accessed by the inner loop relative to the outer loop;
        - looks up the vector type, raises *MIN_VF to the upper bound of its
          old value and the number of vector subparts and, for loops, stores
          the vector type in STMT_VINFO_VECTYPE;
        - validates gather/scatter candidates with vect_check_gather_scatter.

        If FATAL is non-null, *FATAL is set to false when a loop is rejected
        because no vector type exists or because a gather/scatter candidate is
        not suitable; it is not written otherwise.

        Return success, or an opt_result failure describing the problem.
4722
4723    The general structure of the analysis of data refs in the vectorizer is as
4724    follows:
4725    1- vect_analyze_data_refs(loop/bb): call
4726       compute_data_dependences_for_loop/bb to find and analyze all data-refs
4727       in the loop/bb and their dependences.
4728    2- vect_analyze_dependences(): apply dependence testing using ddrs.
4729    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4730    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.

        NOTE(review): step 1 of this outline does not match the code below,
        which makes no call to compute_data_dependences_for_loop/bb; the
        outline looks outdated - confirm and update.
4731
4732 */
4733
4734 opt_result
4735 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4736 {
4737   class loop *loop = NULL;
4738   unsigned int i;
4739   struct data_reference *dr;
4740   tree scalar_type;
4741
4742   DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4743
       /* LOOP stays NULL for basic-block vectorization.  */
4744   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4745     loop = LOOP_VINFO_LOOP (loop_vinfo);
4746
4747   /* Go through the data-refs, check that the analysis succeeded.  Update
4748      pointer from stmt_vec_info struct to DR and vectype.  */
4749
4750   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4751   FOR_EACH_VEC_ELT (datarefs, i, dr)
4752     {
4753       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4754       poly_uint64 vf;
4755
4756       gcc_assert (DR_REF (dr));
4757       stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
           /* The statement must not already have a data reference attached.  */
4758       gcc_assert (!stmt_info->dr_aux.dr);
4759       stmt_info->dr_aux.dr = dr;
4760       stmt_info->dr_aux.stmt = stmt_info;
4761
4762       /* Check that analysis of the data-ref succeeded.  */
4763       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4764           || !DR_STEP (dr))
4765         {
               /* A non-volatile read is a gather candidate, a non-volatile
                  write a scatter candidate.  */
4766           bool maybe_gather
4767             = DR_IS_READ (dr)
4768               && !TREE_THIS_VOLATILE (DR_REF (dr));
4769           bool maybe_scatter
4770             = DR_IS_WRITE (dr)
4771               && !TREE_THIS_VOLATILE (DR_REF (dr));
4772
4773           /* For loop vectorization of a statement that is not nested in an
4774              inner loop, tentatively classify the access as a gather load or
                  scatter store; its suitability is checked further below.  */
4775           if (is_a <loop_vec_info> (vinfo)
4776               && !nested_in_vect_loop_p (loop, stmt_info))
4777             {
4778               if (maybe_gather || maybe_scatter)
4779                 {
4780                   if (maybe_gather)
4781                     gatherscatter = GATHER;
4782                   else
4783                     gatherscatter = SCATTER;
4784                 }
4785             }
4786
4787           if (gatherscatter == SG_NONE)
4788             {
4789               if (dump_enabled_p ())
4790                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4791                                  "not vectorized: data ref analysis "
4792                                  "failed %G", stmt_info->stmt);
4793               if (is_a <bb_vec_info> (vinfo))
4794                 {
4795                   /* In BB vectorization the ref can still participate
4796                      in dependence analysis, we just can't vectorize it.  */
4797                   STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4798                   continue;
4799                 }
4800               return opt_result::failure_at (stmt_info->stmt,
4801                                              "not vectorized:"
4802                                              " data ref analysis failed: %G",
4803                                              stmt_info->stmt);
4804             }
4805         }
4806
4807       /* See if this was detected as SIMD lane access.  */
4808       if (dr->aux == (void *)-1
4809           || dr->aux == (void *)-2
4810           || dr->aux == (void *)-3
4811           || dr->aux == (void *)-4)
4812         {
4813           if (nested_in_vect_loop_p (loop, stmt_info))
4814             return opt_result::failure_at (stmt_info->stmt,
4815                                            "not vectorized:"
4816                                            " data ref analysis failed: %G",
4817                                            stmt_info->stmt);
               /* AUX is one of -1 .. -4 here, so the value stored is
                  1 .. 4.  */
4818           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4819             = -(uintptr_t) dr->aux;
4820         }
4821
           /* References based on a DECL_NONALIASED variable are not
              vectorized.  */
4822       tree base = get_base_address (DR_REF (dr));
4823       if (base && VAR_P (base) && DECL_NONALIASED (base))
4824         {
4825           if (dump_enabled_p ())
4826             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4827                              "not vectorized: base object not addressable "
4828                              "for stmt: %G", stmt_info->stmt);
4829           if (is_a <bb_vec_info> (vinfo))
4830             {
4831               /* In BB vectorization the ref can still participate
4832                  in dependence analysis, we just can't vectorize it.  */
4833               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4834               continue;
4835             }
4836           return opt_result::failure_at (stmt_info->stmt,
4837                                          "not vectorized: base object not"
4838                                          " addressable for stmt: %G",
4839                                          stmt_info->stmt);
4840         }
4841
           /* In a loop, a step that is not an INTEGER_CST makes this a
              strided access, which is not accepted in an inner loop.  */
4842       if (is_a <loop_vec_info> (vinfo)
4843           && DR_STEP (dr)
4844           && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4845         {
4846           if (nested_in_vect_loop_p (loop, stmt_info))
4847             return opt_result::failure_at (stmt_info->stmt,
4848                                            "not vectorized: "
4849                                            "not suitable for strided load %G",
4850                                            stmt_info->stmt);
4851           STMT_VINFO_STRIDED_P (stmt_info) = true;
4852         }
4853
4854       /* Update DR field in stmt_vec_info struct.  */
4855
4856       /* If the dataref is in an inner-loop of the loop that is considered for
4857          vectorization, we also want to analyze the access relative to
4858          the outer-loop (DR contains information only relative to the
4859          inner-most enclosing loop).  We do that by building a reference to the
4860          first location accessed by the inner-loop, and analyze it relative to
4861          the outer-loop.  */
4862       if (loop && nested_in_vect_loop_p (loop, stmt_info))
4863         {
4864           /* Build a reference to the first location accessed by the
4865              inner loop: *(BASE + INIT + OFFSET).  By construction,
4866              this address must be invariant in the inner loop, so we
4867              can consider it as being used in the outer loop.  */
4868           tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4869           tree offset = unshare_expr (DR_OFFSET (dr));
4870           tree init = unshare_expr (DR_INIT (dr));
4871           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4872                                           init, offset);
4873           tree init_addr = fold_build_pointer_plus (base, init_offset);
4874           tree init_ref = build_fold_indirect_ref (init_addr);
4875
4876           if (dump_enabled_p ())
4877             dump_printf_loc (MSG_NOTE, vect_location,
4878                              "analyze in outer loop: %T\n", init_ref);
4879
               /* The result of the outer-loop analysis is stored in
                  STMT_VINFO_DR_WRT_VEC_LOOP of the statement.  */
4880           opt_result res
4881             = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4882                                     init_ref, loop, stmt_info->stmt);
4883           if (!res)
4884             /* dr_analyze_innermost already explained the failure.  */
4885             return res;
4886
               /* NOTE(review): the format below has "\n\n" after the base
                  alignment line, which prints an empty line in the middle of
                  the list; possibly unintended - confirm.  */
4887           if (dump_enabled_p ())
4888             dump_printf_loc (MSG_NOTE, vect_location,
4889                              "\touter base_address: %T\n"
4890                              "\touter offset from base address: %T\n"
4891                              "\touter constant offset from base address: %T\n"
4892                              "\touter step: %T\n"
4893                              "\touter base alignment: %d\n\n"
4894                              "\touter base misalignment: %d\n"
4895                              "\touter offset alignment: %d\n"
4896                              "\touter step alignment: %d\n",
4897                              STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4898                              STMT_VINFO_DR_OFFSET (stmt_info),
4899                              STMT_VINFO_DR_INIT (stmt_info),
4900                              STMT_VINFO_DR_STEP (stmt_info),
4901                              STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4902                              STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4903                              STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4904                              STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4905         }
4906
4907       /* Set vectype for STMT.  */
4908       scalar_type = TREE_TYPE (DR_REF (dr));
4909       tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4910       if (!vectype)
4911         {
4912           if (dump_enabled_p ())
4913             {
4914               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4915                                "not vectorized: no vectype for stmt: %G",
4916                                stmt_info->stmt);
4917               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4918               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4919                                  scalar_type);
4920               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4921             }
4922
4923           if (is_a <bb_vec_info> (vinfo))
4924             {
4925               /* No vector type is fine, the ref can still participate
4926                  in dependence analysis, we just can't vectorize it.  */
4927               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4928               continue;
4929             }
               /* For loops a missing vector type is reported through FATAL
                  as a non-fatal failure.  */
4930           if (fatal)
4931             *fatal = false;
4932           return opt_result::failure_at (stmt_info->stmt,
4933                                          "not vectorized:"
4934                                          " no vectype for stmt: %G"
4935                                          " scalar_type: %T\n",
4936                                          stmt_info->stmt, scalar_type);
4937         }
4938       else
4939         {
4940           if (dump_enabled_p ())
4941             dump_printf_loc (MSG_NOTE, vect_location,
4942                              "got vectype for stmt: %G%T\n",
4943                              stmt_info->stmt, vectype);
4944         }
4945
4946       /* Adjust the minimal vectorization factor according to the
4947          vector type.  */
4948       vf = TYPE_VECTOR_SUBPARTS (vectype);
4949       *min_vf = upper_bound (*min_vf, vf);
4950
4951       /* Leave the BB vectorizer to pick the vector type later, based on
4952          the final dataref group size and SLP node size.  */
4953       if (is_a <loop_vec_info> (vinfo))
4954         STMT_VINFO_VECTYPE (stmt_info) = vectype;
4955
4956       if (gatherscatter != SG_NONE)
4957         {
               /* The candidate must be accepted by vect_check_gather_scatter
                  and the type of the resulting offset must itself have a
                  vector type; otherwise fail, reporting through FATAL that
                  the failure is not fatal.  */
4958           gather_scatter_info gs_info;
4959           if (!vect_check_gather_scatter (stmt_info,
4960                                           as_a <loop_vec_info> (vinfo),
4961                                           &gs_info)
4962               || !get_vectype_for_scalar_type (vinfo,
4963                                                TREE_TYPE (gs_info.offset)))
4964             {
4965               if (fatal)
4966                 *fatal = false;
4967               return opt_result::failure_at
4968                         (stmt_info->stmt,
4969                          (gatherscatter == GATHER)
4970                          ? "not vectorized: not suitable for gather load %G"
4971                          : "not vectorized: not suitable for scatter store %G",
4972                          stmt_info->stmt);
4973             }
4974           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4975         }
4976     }
4977
4978   /* We used to stop processing and prune the list here.  Verify we no
4979      longer need to.  */
4980   gcc_assert (i == datarefs.length ());
4981
4982   return opt_result::success ();
4983 }
4984
4985
4986 /* Function vect_get_new_vect_var.
4987
4988    Return a new variable of type TYPE, created with create_tmp_reg.  The
4989    name passed to create_tmp_reg starts with a prefix selected by VAR_KIND:
4990    "vect", "stmp", "mask" or "vectp".  If NAME is non-null, "_" and NAME
4991    are appended to that prefix.  Any other VAR_KIND is gcc_unreachable.  */
4992
4993 tree
4994 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4995 {
4996   const char *prefix;
4997   tree new_vect_var;
4998
4999   switch (var_kind)
5000   {
5001   case vect_simple_var:
5002     prefix = "vect";
5003     break;
5004   case vect_scalar_var:
5005     prefix = "stmp";
5006     break;
5007   case vect_mask_var:
5008     prefix = "mask";
5009     break;
5010   case vect_pointer_var:
5011     prefix = "vectp";
5012     break;
5013   default:
5014     gcc_unreachable ();
5015   }
5016
5017   if (name)
5018     {
           /* The string built by concat is only needed while the variable is
              created and is freed right afterwards.  */
5019       char* tmp = concat (prefix, "_", name, NULL);
5020       new_vect_var = create_tmp_reg (type, tmp);
5021       free (tmp);
5022     }
5023   else
5024     new_vect_var = create_tmp_reg (type, prefix);
5025
5026   return new_vect_var;
5027 }
5028
5029 /* Like vect_get_new_vect_var but return an SSA name of type TYPE, created
        with make_temp_ssa_name.  The name prefix is "vect", "stmp" or "vectp"
        for vect_simple_var, vect_scalar_var and vect_pointer_var respectively;
        if NAME is non-null, "_" and NAME are appended to it.  Unlike
        vect_get_new_vect_var there is no vect_mask_var case here, so that
        kind, like any other, reaches gcc_unreachable.  */
5030
5031 tree
5032 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
5033 {
5034   const char *prefix;
5035   tree new_vect_var;
5036
5037   switch (var_kind)
5038   {
5039   case vect_simple_var:
5040     prefix = "vect";
5041     break;
5042   case vect_scalar_var:
5043     prefix = "stmp";
5044     break;
5045   case vect_pointer_var:
5046     prefix = "vectp";
5047     break;
5048   default:
5049     gcc_unreachable ();
5050   }
5051
5052   if (name)
5053     {
           /* The string built by concat is only needed while the SSA name is
              created and is freed right afterwards.  */
5054       char* tmp = concat (prefix, "_", name, NULL);
5055       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
5056       free (tmp);
5057     }
5058   else
5059     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
5060
5061   return new_vect_var;
5062 }
5063
5064 /* Duplicate points-to info on NAME from DR_INFO: the DR_PTR_INFO of the
        data reference in DR_INFO is duplicated onto the SSA name NAME, and the
        alignment recorded in NAME's pointer info is then marked unknown.  */
5065
5066 static void
5067 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
5068 {
5069   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
5070   /* DR_PTR_INFO is for a base SSA name, not including constant or
5071      variable offsets in the ref so its alignment info does not apply.  */
5072   mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
5073 }
5074
5075 /* Function vect_create_addr_base_for_vector_ref.
5076
5077    Create an expression that computes the address of the first memory location
5078    that will be accessed for a data reference.
5079
5080    Input:
        VINFO: The vectorization info.  When it is a loop_vec_info the address
               is built from the base address, offset and init of the data
               reference; otherwise the address of DR_REF itself is taken.
5081    STMT_INFO: The statement containing the data reference.
5082    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
               The statements that compute the address are appended to the
               sequence it points to.
5083    OFFSET: Optional. If supplied, it is added to the initial address.
5084    LOOP:    Specify relative to which loop-nest should the address be computed.
5085             For example, when the dataref is in an inner-loop nested in an
5086             outer-loop that is now being vectorized, LOOP can be either the
5087             outer-loop, or the inner-loop.  The first memory location accessed
5088             by the following dataref ('in' points to short):
5089
5090                 for (i=0; i<N; i++)
5091                    for (j=0; j<M; j++)
5092                      s += in[i+j]
5093
5094             is as follows:
5095             if LOOP=i_loop:     &in             (relative to i_loop)
5096             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)

                 NOTE(review): the function has no LOOP parameter; presumably
                 the loop-relative behavior now comes from vect_dr_behavior -
                 confirm and update this paragraph.
5097
5098    Output:
5099    1. Return an SSA_NAME whose value is the address of the memory location of
5100       the first vector of the data reference.
5101    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
5102       these statement(s) which define the returned SSA_NAME.
5103
5104    FORNOW: We are only handling array accesses with step 1.  */
5105
5106 tree
5107 vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
5108                                       gimple_seq *new_stmt_list,
5109                                       tree offset)
5110 {
5111   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5112   struct data_reference *dr = dr_info->dr;
5113   const char *base_name;
5114   tree addr_base;
5115   tree dest;
5116   gimple_seq seq = NULL;
5117   tree vect_ptr_type;
5118   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5119   innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
5120
5121   tree data_ref_base = unshare_expr (drb->base_address);
5122   tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
5123   tree init = unshare_expr (drb->init);
5124
       /* Without a loop_vec_info the address is taken from DR_REF directly
          (see below), so the offset and init are reset to zero and the name
          is derived from DR_REF instead of the base address.  */
5125   if (loop_vinfo)
5126     base_name = get_name (data_ref_base);
5127   else
5128     {
5129       base_offset = ssize_int (0);
5130       init = ssize_int (0);
5131       base_name = get_name (DR_REF (dr));
5132     }
5133
5134   /* Create base_offset */
5135   base_offset = size_binop (PLUS_EXPR,
5136                             fold_convert (sizetype, base_offset),
5137                             fold_convert (sizetype, init));
5138
5139   if (offset)
5140     {
5141       offset = fold_convert (sizetype, offset);
5142       base_offset = fold_build2 (PLUS_EXPR, sizetype,
5143                                  base_offset, offset);
5144     }
5145
5146   /* base + base_offset */
       /* NOTE(review): only the loop_vinfo branch uses BASE_OFFSET; in the
          other branch BASE_OFFSET, and with it any OFFSET argument, does not
          enter the address - confirm that this is intended.  */
5147   if (loop_vinfo)
5148     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
5149   else
5150     addr_base = build1 (ADDR_EXPR,
5151                         build_pointer_type (TREE_TYPE (DR_REF (dr))),
5152                         /* Strip zero offset components since we don't need
5153                            them and they can confuse late diagnostics if
5154                            we CSE them wrongly.  See PR106904 for example.  */
5155                         unshare_expr (strip_zero_offset_components
5156                                                                 (DR_REF (dr))));
5157
       /* Gimplify the address expression towards a new pointer variable DEST;
          the statements this produces are appended to *NEW_STMT_LIST.  */
5158   vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
5159   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
5160   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
5161   gimple_seq_add_seq (new_stmt_list, seq);
5162
5163   if (DR_PTR_INFO (dr)
5164       && TREE_CODE (addr_base) == SSA_NAME
5165       /* We should only duplicate pointer info to newly created SSA names.  */
5166       && SSA_NAME_VAR (addr_base) == dest)
5167     {
5168       gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
5169       vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
5170     }
5171
5172   if (dump_enabled_p ())
5173     dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
5174
5175   return addr_base;
5176 }
5177
5178
5179 /* Function vect_create_data_ref_ptr.
5180
5181    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
5182    location accessed in the loop by STMT_INFO, along with the def-use update
5183    chain to appropriately advance the pointer through the loop iterations.
5184    Also set aliasing information for the pointer.  This pointer is used by
5185    the callers to this function to create a memory reference expression for
5186    vector load/store access.
5187
5188    Input:
5189    1. STMT_INFO: a stmt that references memory. Expected to be of the form
5190          GIMPLE_ASSIGN <name, data-ref> or
5191          GIMPLE_ASSIGN <data-ref, name>.
5192    2. AGGR_TYPE: the type of the reference, which should be either a vector
5193         or an array.
5194    3. AT_LOOP: the loop where the vector memref is to be created.
5195    4. OFFSET (optional): a byte offset to be added to the initial address
5196         accessed by the data-ref in STMT_INFO.
5197    5. GSI: location where the new stmts are to be placed if there is no loop
5198    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
5199         pointing to the initial address.
5200    7. IV_STEP (optional, defaults to NULL): the amount that should be added
5201         to the IV during each iteration of the loop.  NULL says to move
5202         by one copy of AGGR_TYPE up or down, depending on the step of the
5203         data reference.
5204
5205    Output:
5206    1. Declare a new ptr to vector_type, and have it point to the base of the
5207       data reference (initial address accessed by the data reference).
5208       For example, for vector of type V8HI, the following code is generated:
5209
5210       v8hi *ap;
5211       ap = (v8hi *)initial_address;
5212
5213       if OFFSET is not supplied:
5214          initial_address = &a[init];
5215       if OFFSET is supplied:
5216          initial_address = &a[init] + OFFSET;
5217       (there is no separate BYTE_OFFSET argument; OFFSET is already a
5218          byte offset, see input 4 above.)
5219
5220       Return the initial_address in INITIAL_ADDRESS.
5221
5222    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
5223       update the pointer in each iteration of the loop.
5224
5225       Return the increment stmt that updates the pointer in PTR_INCR.
5226
5227    3. Return the pointer.  */
5228
5229 tree
5230 vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
5231                           tree aggr_type, class loop *at_loop, tree offset,
5232                           tree *initial_address, gimple_stmt_iterator *gsi,
5233                           gimple **ptr_incr, bool only_init,
5234                           tree iv_step)
5235 {
5236   const char *base_name;
5237   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5238   class loop *loop = NULL;
5239   bool nested_in_vect_loop = false;
5240   class loop *containing_loop = NULL;
5241   tree aggr_ptr_type;
5242   tree aggr_ptr;
5243   tree new_temp;
5244   gimple_seq new_stmt_list = NULL;
5245   edge pe = NULL;
5246   basic_block new_bb;
5247   tree aggr_ptr_init;
5248   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5249   struct data_reference *dr = dr_info->dr;
5250   tree aptr;
5251   gimple_stmt_iterator incr_gsi;
5252   bool insert_after;
5253   tree indx_before_incr, indx_after_incr;
5254   gimple *incr;
5255   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
5256
       /* Without an explicit IV_STEP the step is derived from the size of
          AGGR_TYPE (see step (3)), so AGGR_TYPE must then be an array or a
          vector type.  */
5257   gcc_assert (iv_step != NULL_TREE
5258               || TREE_CODE (aggr_type) == ARRAY_TYPE
5259               || TREE_CODE (aggr_type) == VECTOR_TYPE);
5260
5261   if (loop_vinfo)
5262     {
5263       loop = LOOP_VINFO_LOOP (loop_vinfo);
5264       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5265       containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5266       pe = loop_preheader_edge (loop);
5267     }
5268   else
5269     {
           /* Not a loop vectorization (VINFO must then be a bb_vec_info): the
              pointer is never advanced, so force ONLY_INIT and report that
              there is no increment stmt.
              NOTE(review): PTR_INCR is stored through unconditionally here
              but is NULL-checked in steps (3) and (4) below -- confirm that
              callers never pass a NULL PTR_INCR on this path.  */
5270       gcc_assert (bb_vinfo);
5271       only_init = true;
5272       *ptr_incr = NULL;
5273     }
5274
5275   /* Create an expression for the first address accessed by this load
5276      in LOOP.  */
5277   base_name = get_name (DR_BASE_ADDRESS (dr));
5278
5279   if (dump_enabled_p ())
5280     {
5281       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
5282       dump_printf_loc (MSG_NOTE, vect_location,
5283                        "create %s-pointer variable to type: %T",
5284                        get_tree_code_name (TREE_CODE (aggr_type)),
5285                        aggr_type);
5286       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
5287         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
5288       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
5289         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
5290       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
5291         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
5292       else
5293         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
5294       dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
5295     }
5296
5297   /* (1) Create the new aggregate-pointer variable.
5298      Vector and array types inherit the alias set of their component
5299      type by default so we need to use a ref-all pointer if the data
5300      reference does not conflict with the created aggregated data
5301      reference because it is not addressable.  */
5302   bool need_ref_all = false;
5303   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5304                               get_alias_set (DR_REF (dr))))
5305     need_ref_all = true;
5306   /* Likewise for any of the data references in the stmt group.  */
5307   else if (DR_GROUP_SIZE (stmt_info) > 1)
5308     {
5309       stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5310       do
5311         {
5312           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5313           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5314                                       get_alias_set (DR_REF (sdr))))
5315             {
5316               need_ref_all = true;
5317               break;
5318             }
5319           sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5320         }
5321       while (sinfo);
5322     }
5323   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
5324                                                need_ref_all);
       /* The new pointer variable takes its name from BASE_NAME computed
          above.  */
5325   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5326
5327
5328   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5329      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5330      def-use update cycles for the pointer: one relative to the outer-loop
5331      (LOOP), which is what steps (2) and (3) below do.  The other is relative
5332      to the inner-loop (which is the inner-most loop containing the dataref),
5333      and this is done by step (4) below.
5334
5335      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5336      inner-most loop, and so steps (2),(3) work the same, and step (4) is
5337      redundant.  Steps (2),(3) create the following:
5338
5339         vp0 = &base_addr;
5340         LOOP:   vp1 = phi(vp0,vp2)
5341                 ...
5342                 ...
5343                 vp2 = vp1 + step
5344                 goto LOOP
5345
5346      If there is an inner-loop nested in loop, then step (4) will also be
5347      applied, and an additional update in the inner-loop will be created:
5348
5349         vp0 = &base_addr;
5350         LOOP:   vp1 = phi(vp0,vp2)
5351                 ...
5352         inner:     vp3 = phi(vp1,vp4)
5353                    vp4 = vp3 + inner_step
5354                    if () goto inner
5355                 ...
5356                 vp2 = vp1 + step
5357                 if () goto LOOP   */
5358
5359   /* (2) Calculate the initial address of the aggregate-pointer, and set
5360      the aggregate-pointer to point to it before the loop.  */
5361
5362   /* Create: &(base[init_val]) + offset, emitted on the loop preheader edge,
          or before GSI when there is no loop.  */
5363
5364   new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5365                                                    stmt_info, &new_stmt_list,
5366                                                    offset);
5367   if (new_stmt_list)
5368     {
5369       if (pe)
5370         {
5371           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5372           gcc_assert (!new_bb);
5373         }
5374       else
5375         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5376     }
5377
5378   *initial_address = new_temp;
5379   aggr_ptr_init = new_temp;
5380
5381   /* (3) Handle the updating of the aggregate-pointer inside the loop.
5382      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5383      inner-loop nested in LOOP (during outer-loop vectorization).  */
5384
5385   /* No update in loop is required.  */
5386   if (only_init && (!loop_vinfo || at_loop == loop))
5387     aptr = aggr_ptr_init;
5388   else
5389     {
5390       /* Accesses to invariant addresses should be handled specially
5391          by the caller.  */
5392       tree step = vect_dr_behavior (vinfo, dr_info)->step;
5393       gcc_assert (!integer_zerop (step));
5394
5395       if (iv_step == NULL_TREE)
5396         {
5397           /* The step of the aggregate pointer is the type size,
5398              negated for downward accesses.  */
5399           iv_step = TYPE_SIZE_UNIT (aggr_type);
5400           if (tree_int_cst_sgn (step) == -1)
5401             iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5402         }
5403
5404       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5405
5406       create_iv (aggr_ptr_init, PLUS_EXPR,
5407                  fold_convert (aggr_ptr_type, iv_step),
5408                  aggr_ptr, loop, &incr_gsi, insert_after,
5409                  &indx_before_incr, &indx_after_incr);
5410       incr = gsi_stmt (incr_gsi);
5411
5412       /* Copy the points-to information if it exists. */
5413       if (DR_PTR_INFO (dr))
5414         {
5415           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5416           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5417         }
5418       if (ptr_incr)
5419         *ptr_incr = incr;
5420
5421       aptr = indx_before_incr;
5422     }
5423
       /* Without an inner loop, or when only the initial pointer was
          requested, the pointer computed above is the result.  */
5424   if (!nested_in_vect_loop || only_init)
5425     return aptr;
5426
5427
5428   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5429      nested in LOOP, if exists.  */
5430
       /* The early return above guarantees both NESTED_IN_VECT_LOOP and
          !ONLY_INIT here, so the assert below always holds and the
          else-branch is never reached.  */
5431   gcc_assert (nested_in_vect_loop);
5432   if (!only_init)
5433     {
5434       standard_iv_increment_position (containing_loop, &incr_gsi,
5435                                       &insert_after);
           /* The inner-loop IV is created in CONTAINING_LOOP from the
              outer-loop pointer APTR, with DR_STEP of the data reference as
              its step operand.  */
5436       create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
5437                  aggr_ptr, containing_loop, &incr_gsi, insert_after,
5438                  &indx_before_incr, &indx_after_incr);
5439       incr = gsi_stmt (incr_gsi);
5440
5441       /* Copy the points-to information if it exists. */
5442       if (DR_PTR_INFO (dr))
5443         {
5444           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5445           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5446         }
           /* This overwrites the outer-loop increment stored by step (3).  */
5447       if (ptr_incr)
5448         *ptr_incr = incr;
5449
5450       return indx_before_incr;
5451     }
5452   else
5453     gcc_unreachable ();
5454 }
5455
5456
5457 /* Function bump_vector_ptr
5458
5459    Increment a pointer (to a vector type) by vector-size. If requested,
5460    i.e. if PTR-INCR is given, then also connect the new increment stmt
5461    to the existing def-use update-chain of the pointer, by modifying
5462    the PTR_INCR as illustrated below:
5463
5464    The pointer def-use update-chain before this function:
5465                         DATAREF_PTR = phi (p_0, p_2)
5466                         ....
5467         PTR_INCR:       p_2 = DATAREF_PTR + step
5468
5469    The pointer def-use update-chain after this function:
5470                         DATAREF_PTR = phi (p_0, p_2)
5471                         ....
5472                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5473                         ....
5474         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
5475
5476    Input:
5477    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5478                  in the loop.
5479    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5480               the loop.  The increment amount across iterations is expected
5481               to be vector_size.
5482    GSI - location where the new update stmt is to be placed.
5483    STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5484    BUMP - optional. The offset by which to bump the pointer. If not given,
5485           the offset is assumed to be vector_size.
5486
5487    Output: Return NEW_DATAREF_PTR as illustrated above.
5488
5489 */
5490
5491 tree
5492 bump_vector_ptr (vec_info *vinfo,
5493                  tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5494                  stmt_vec_info stmt_info, tree bump)
5495 {
5496   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5497   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5498   tree update = TYPE_SIZE_UNIT (vectype);
5499   gimple *incr_stmt;
5500   ssa_op_iter iter;
5501   use_operand_p use_p;
5502   tree new_dataref_ptr;
5503
       /* An explicit BUMP overrides the default of one vector size.  */
5504   if (bump)
5505     update = bump;
5506
5507   if (TREE_CODE (dataref_ptr) == SSA_NAME)
5508     new_dataref_ptr = copy_ssa_name (dataref_ptr);
5509   else if (is_gimple_min_invariant (dataref_ptr))
5510     /* When possible avoid emitting a separate increment stmt that will
5511        force the addressed object addressable.  */
         /* Note that this path returns early: no stmt is emitted and
            PTR_INCR is left untouched.  */
5512     return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
5513                    fold_build2 (MEM_REF,
5514                                 TREE_TYPE (TREE_TYPE (dataref_ptr)),
5515                                 dataref_ptr,
5516                                 fold_convert (ptr_type_node, update)));
5517   else
5518     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5519   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5520                                    dataref_ptr, update);
5521   vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5522   /* Fold the increment, avoiding excessive use-def chains of
5523      those, leading to compile-time issues for passes until the next
5524      forwprop pass which would do this as well.  */
5525   gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5526   if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5527     {
5528       incr_stmt = gsi_stmt (fold_gsi);
5529       update_stmt (incr_stmt);
5530     }
5531
5532   /* Copy the points-to information if it exists. */
5533   if (DR_PTR_INFO (dr))
5534     {
5535       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
           /* The alignment part of the copied info is reset to unknown for
              the bumped pointer.  */
5536       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5537     }
5538
5539   if (!ptr_incr)
5540     return new_dataref_ptr;
5541
5542   /* Update the vector-pointer's cross-iteration increment.  */
       /* Every use of DATAREF_PTR in PTR_INCR is redirected to the bumped
          pointer; any other use operand is asserted to equal UPDATE.  */
5543   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5544     {
5545       tree use = USE_FROM_PTR (use_p);
5546
5547       if (use == dataref_ptr)
5548         SET_USE (use_p, new_dataref_ptr);
5549       else
5550         gcc_assert (operand_equal_p (use, update, 0));
5551     }
5552
5553   return new_dataref_ptr;
5554 }
5555
5556
5557 /* Copy memory reference info such as base/clique from the SRC reference
5558    to the DEST MEM_REF.
     
        Nothing is copied when DEST is not a MEM_REF, or when the innermost
        base of SRC is neither a MEM_REF nor a TARGET_MEM_REF.  */
5559
5560 void
5561 vect_copy_ref_info (tree dest, tree src)
5562 {
5563   if (TREE_CODE (dest) != MEM_REF)
5564     return;
5565
       /* Strip all handled components from SRC to reach its innermost base.  */
5566   tree src_base = src;
5567   while (handled_component_p (src_base))
5568     src_base = TREE_OPERAND (src_base, 0);
5569   if (TREE_CODE (src_base) != MEM_REF
5570       && TREE_CODE (src_base) != TARGET_MEM_REF)
5571     return;
5572
5573   MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5574   MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5575 }
5576
5577
5578 /* Function vect_create_destination_var.
5579
5580    Create a new temporary of type VECTYPE.
     
        SCALAR_DEST must be an SSA_NAME; the new variable is named after it.
        If VECTYPE is NULL, the temporary is a scalar variable with the type
        of SCALAR_DEST instead.  */
5581
5582 tree
5583 vect_create_destination_var (tree scalar_dest, tree vectype)
5584 {
5585   tree vec_dest;
5586   const char *name;
5587   char *new_name;
5588   tree type;
5589   enum vect_var_kind kind;
5590
       /* A boolean vector type gets a mask variable, any other vector type a
          plain vector variable, and a NULL VECTYPE a scalar variable.  */
5591   kind = vectype
5592     ? VECTOR_BOOLEAN_TYPE_P (vectype)
5593     ? vect_mask_var
5594     : vect_simple_var
5595     : vect_scalar_var;
5596   type = vectype ? vectype : TREE_TYPE (scalar_dest);
5597
5598   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5599
       /* Build "<name>_<version>", or just "_<version>" when SCALAR_DEST has
          no name.  */
5600   name = get_name (scalar_dest);
5601   if (name)
5602     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5603   else
5604     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5605   vec_dest = vect_get_new_vect_var (type, kind, new_name);
       /* NEW_NAME is no longer needed once the variable has been created.  */
5606   free (new_name);
5607
5608   return vec_dest;
5609 }
5610
5611 /* Function vect_grouped_store_supported.
5612
5613    Returns TRUE if interleave high and interleave low permutations
5614    are supported, and FALSE otherwise.
     
        VECTYPE supplies the vector mode whose permutations are queried and
        COUNT is the size of the group of accesses.  COUNT must be 3 or a
        power of two.  For COUNT == 3 the two three-way shuffle masks built
        for each of the three output vectors are checked instead of the
        interleave masks; this requires a constant number of vector
        elements.  */
5615
5616 bool
5617 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5618 {
5619   machine_mode mode = TYPE_MODE (vectype);
5620
5621   /* vect_permute_store_chain requires the group size to be equal to 3 or
5622      be a power of two.  */
5623   if (count != 3 && exact_log2 (count) == -1)
5624     {
5625       if (dump_enabled_p ())
5626         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5627                          "the size of the group of accesses"
5628                          " is not a power of 2 or not eqaul to 3\n");
5629       return false;
5630     }
5631
5632   /* Check that the permutation is supported.  */
5633   if (VECTOR_MODE_P (mode))
5634     {
5635       unsigned int i;
5636       if (count == 3)
5637         {
5638           unsigned int j0 = 0, j1 = 0, j2 = 0;
               /* NOTE(review): this I shadows the one declared just above.  */
5639           unsigned int i, j;
5640
5641           unsigned int nelt;
5642           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5643             {
5644               if (dump_enabled_p ())
5645                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5646                                  "cannot handle groups of 3 stores for"
5647                                  " variable-length vectors\n");
5648               return false;
5649             }
5650
5651           vec_perm_builder sel (nelt, nelt, 1);
5652           sel.quick_grow (nelt);
5653           vec_perm_indices indices;
               /* One iteration per output vector J.  J0, J1 and J2 are not
                  reset between iterations, so each output vector continues
                  from where the previous one stopped in the three inputs.  */
5654           for (j = 0; j < 3; j++)
5655             {
5656               int nelt0 = ((3 - j) * nelt) % 3;
5657               int nelt1 = ((3 - j) * nelt + 1) % 3;
5658               int nelt2 = ((3 - j) * nelt + 2) % 3;
                   /* First mask: fill two of every three slots from the two
                      inputs; the third slot gets a placeholder index 0.  */
5659               for (i = 0; i < nelt; i++)
5660                 {
5661                   if (3 * i + nelt0 < nelt)
5662                     sel[3 * i + nelt0] = j0++;
5663                   if (3 * i + nelt1 < nelt)
5664                     sel[3 * i + nelt1] = nelt + j1++;
5665                   if (3 * i + nelt2 < nelt)
5666                     sel[3 * i + nelt2] = 0;
5667                 }
5668               indices.new_vector (sel, 2, nelt);
5669               if (!can_vec_perm_const_p (mode, mode, indices))
5670                 {
5671                   if (dump_enabled_p ())
5672                     dump_printf (MSG_MISSED_OPTIMIZATION,
5673                                  "permutation op not supported by target.\n");
5674                   return false;
5675                 }
5676
                   /* Second mask: keep the two slots filled by the first mask
                      in place and take the remaining slot from the second
                      input.  */
5677               for (i = 0; i < nelt; i++)
5678                 {
5679                   if (3 * i + nelt0 < nelt)
5680                     sel[3 * i + nelt0] = 3 * i + nelt0;
5681                   if (3 * i + nelt1 < nelt)
5682                     sel[3 * i + nelt1] = 3 * i + nelt1;
5683                   if (3 * i + nelt2 < nelt)
5684                     sel[3 * i + nelt2] = nelt + j2++;
5685                 }
5686               indices.new_vector (sel, 2, nelt);
5687               if (!can_vec_perm_const_p (mode, mode, indices))
5688                 {
5689                   if (dump_enabled_p ())
5690                     dump_printf (MSG_MISSED_OPTIMIZATION,
5691                                  "permutation op not supported by target.\n");
5692                   return false;
5693                 }
5694             }
5695           return true;
5696         }
5697       else
5698         {
5699           /* If length is not equal to 3 then only power of 2 is supported.  */
5700           gcc_assert (pow2p_hwi (count));
5701           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5702
5703           /* The encoding has 2 interleaved stepped patterns.  */
5704           if(!multiple_p (nelt, 2))
5705             return false;
5706           vec_perm_builder sel (nelt, 2, 3);
5707           sel.quick_grow (6);
               /* Interleave-high mask: { 0, nelt, 1, nelt + 1, 2, nelt + 2 }.  */
5708           for (i = 0; i < 3; i++)
5709             {
5710               sel[i * 2] = i;
5711               sel[i * 2 + 1] = i + nelt;
5712             }
5713           vec_perm_indices indices (sel, 2, nelt);
5714           if (can_vec_perm_const_p (mode, mode, indices))
5715             {
                   /* Interleave-low mask: the same pattern shifted by half a
                      vector.  */
5716               for (i = 0; i < 6; i++)
5717                 sel[i] += exact_div (nelt, 2);
5718               indices.new_vector (sel, 2, nelt);
5719               if (can_vec_perm_const_p (mode, mode, indices))
5720                 return true;
5721             }
5722         }
5723     }
5724
       /* Reached when MODE is not a vector mode or when one of the interleave
          masks is rejected.  */
5725   if (dump_enabled_p ())
5726     dump_printf (MSG_MISSED_OPTIMIZATION,
5727                  "permutation op not supported by target.\n");
5728   return false;
5729 }
5730
5731 /* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors
5732    of type VECTYPE.  MASKED_P says whether the masked form is needed.
     
        The mask_len form is tried first, regardless of MASKED_P.  After that
        only the form selected by MASKED_P is tried: the masked one if it is
        true, the unmasked one otherwise.  IFN_LAST is returned when none of
        the probed forms is available.  */
5733
5734 internal_fn
5735 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5736                             bool masked_p)
5737 {
5738   if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
5739                                     vec_mask_len_store_lanes_optab, vectype,
5740                                     count))
5741     return IFN_MASK_LEN_STORE_LANES;
5742   else if (masked_p)
5743     {
5744       if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5745                                         vec_mask_store_lanes_optab, vectype,
5746                                         count))
5747         return IFN_MASK_STORE_LANES;
5748     }
5749   else
5750     {
5751       if (vect_lanes_optab_supported_p ("vec_store_lanes",
5752                                         vec_store_lanes_optab, vectype, count))
5753         return IFN_STORE_LANES;
5754     }
5755   return IFN_LAST;
5756 }
5757
5758
5759 /* Function vect_permute_store_chain.
5760
5761    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5762    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5763    the data correctly for the stores.  Return the final references for stores
5764    in RESULT_CHAIN.
5765
5766    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5767    The input is 4 vectors each containing 8 elements.  We assign a number to
5768    each element, the input sequence is:
5769
5770    1st vec:   0  1  2  3  4  5  6  7
5771    2nd vec:   8  9 10 11 12 13 14 15
5772    3rd vec:  16 17 18 19 20 21 22 23
5773    4th vec:  24 25 26 27 28 29 30 31
5774
5775    The output sequence should be:
5776
5777    1st vec:  0  8 16 24  1  9 17 25
5778    2nd vec:  2 10 18 26  3 11 19 27
5779    3rd vec:  4 12 20 28  5 13 21 29
5780    4th vec:  6 14 22 30  7 15 23 31
5781
5782    i.e., we interleave the contents of the four vectors in their order.
5783
5784    We use interleave_high/low instructions to create such output.  The input of
5785    each interleave_high/low operation is two vectors:
5786    1st vec    2nd vec
5787    0 1 2 3    4 5 6 7
5788    the even elements of the result vector are obtained left-to-right from the
5789    high/low elements of the first vector.  The odd elements of the result are
5790    obtained left-to-right from the high/low elements of the second vector.
5791    The output of interleave_high will be:   0 4 1 5
5792    and of interleave_low:                   2 6 3 7
5793
5794
5795    The permutation is done in log LENGTH stages.  In each stage interleave_high
5796    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5797    where the first argument is taken from the first half of DR_CHAIN and the
5798    second argument from its second half.
5799    In our example,
5800
5801    I1: interleave_high (1st vec, 3rd vec)
5802    I2: interleave_low (1st vec, 3rd vec)
5803    I3: interleave_high (2nd vec, 4th vec)
5804    I4: interleave_low (2nd vec, 4th vec)
5805
5806    The output for the first stage is:
5807
5808    I1:  0 16  1 17  2 18  3 19
5809    I2:  4 20  5 21  6 22  7 23
5810    I3:  8 24  9 25 10 26 11 27
5811    I4: 12 28 13 29 14 30 15 31
5812
5813    The output of the second stage, i.e. the final result is:
5814
5815    I1:  0  8 16 24  1  9 17 25
5816    I2:  2 10 18 26  3 11 19 27
5817    I3:  4 12 20 28  5 13 21 29
5818    I4:  6 14 22 30  7 15 23 31.  */
5819
5820 void
5821 vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5822                           unsigned int length,
5823                           stmt_vec_info stmt_info,
5824                           gimple_stmt_iterator *gsi,
5825                           vec<tree> *result_chain)
5826 {
5827   tree vect1, vect2, high, low;
5828   gimple *perm_stmt;
5829   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5830   tree perm_mask_low, perm_mask_high;
5831   tree data_ref;
5832   tree perm3_mask_low, perm3_mask_high;
       /* LOG_LENGTH is only used on the power-of-two path below.  */
5833   unsigned int i, j, n, log_length = exact_log2 (length);
5834
       /* Start RESULT_CHAIN as a copy of the first LENGTH entries of DR_CHAIN.
          NOTE(review): presumably the caller has already reserved LENGTH
          slots in RESULT_CHAIN for quick_grow -- confirm.  */
5835   result_chain->quick_grow (length);
5836   memcpy (result_chain->address (), dr_chain.address (),
5837           length * sizeof (tree));
5838
5839   if (length == 3)
5840     {
5841       /* vect_grouped_store_supported ensures that this is constant.  */
5842       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5843       unsigned int j0 = 0, j1 = 0, j2 = 0;
5844
5845       vec_perm_builder sel (nelt, nelt, 1);
5846       sel.quick_grow (nelt);
5847       vec_perm_indices indices;
           /* One iteration per output vector J.  Each iteration reads the
              original DR_CHAIN[0..2] and writes only (*RESULT_CHAIN)[J].  J0,
              J1 and J2 are not reset between iterations.  */
5848       for (j = 0; j < 3; j++)
5849         {
5850           int nelt0 = ((3 - j) * nelt) % 3;
5851           int nelt1 = ((3 - j) * nelt + 1) % 3;
5852           int nelt2 = ((3 - j) * nelt + 2) % 3;
5853
5854           for (i = 0; i < nelt; i++)
5855             {
5856               if (3 * i + nelt0 < nelt)
5857                 sel[3 * i + nelt0] = j0++;
5858               if (3 * i + nelt1 < nelt)
5859                 sel[3 * i + nelt1] = nelt + j1++;
5860               if (3 * i + nelt2 < nelt)
5861                 sel[3 * i + nelt2] = 0;
5862             }
5863           indices.new_vector (sel, 2, nelt);
5864           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5865
5866           for (i = 0; i < nelt; i++)
5867             {
5868               if (3 * i + nelt0 < nelt)
5869                 sel[3 * i + nelt0] = 3 * i + nelt0;
5870               if (3 * i + nelt1 < nelt)
5871                 sel[3 * i + nelt1] = 3 * i + nelt1;
5872               if (3 * i + nelt2 < nelt)
5873                 sel[3 * i + nelt2] = nelt + j2++;
5874             }
5875           indices.new_vector (sel, 2, nelt);
5876           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5877
5878           vect1 = dr_chain[0];
5879           vect2 = dr_chain[1];
5880
5881           /* Create interleaving stmt:
5882              low = VEC_PERM_EXPR <vect1, vect2,
5883                                   {j, nelt, *, j + 1, nelt + j + 1, *,
5884                                    j + 2, nelt + j + 2, *, ...}>  */
5885           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5886           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5887                                            vect2, perm3_mask_low);
5888           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5889
               /* The second shuffle combines the result of the first one with
                  the third input vector.  */
5890           vect1 = data_ref;
5891           vect2 = dr_chain[2];
5892           /* Create interleaving stmt:
5893              high = VEC_PERM_EXPR <vect1, vect2,
5894                                    {0, 1, nelt + j, 3, 4, nelt + j + 1,
5895                                     6, 7, nelt + j + 2, ...}>  */
5896           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5897           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5898                                            vect2, perm3_mask_high);
5899           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5900           (*result_chain)[j] = data_ref;
5901         }
5902     }
5903   else
5904     {
5905       /* If length is not equal to 3 then only power of 2 is supported.  */
5906       gcc_assert (pow2p_hwi (length));
5907
5908       /* The encoding has 2 interleaved stepped patterns.  */
5909       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5910       vec_perm_builder sel (nelt, 2, 3);
5911       sel.quick_grow (6);
5912       for (i = 0; i < 3; i++)
5913         {
5914           sel[i * 2] = i;
5915           sel[i * 2 + 1] = i + nelt;
5916         }
5917         vec_perm_indices indices (sel, 2, nelt);
5918         perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5919
             /* The low mask is the high mask shifted by half a vector.  */
5920         for (i = 0; i < 6; i++)
5921           sel[i] += exact_div (nelt, 2);
5922         indices.new_vector (sel, 2, nelt);
5923         perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5924
             /* Run LOG_LENGTH stages; each stage pairs entry J of DR_CHAIN
                with entry J + LENGTH/2.  */
5925         for (i = 0, n = log_length; i < n; i++)
5926           {
5927             for (j = 0; j < length/2; j++)
5928               {
5929                 vect1 = dr_chain[j];
5930                 vect2 = dr_chain[j+length/2];
5931
5932                 /* Create interleaving stmt:
5933                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5934                                                         ...}>  */
5935                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5936                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5937                                                  vect2, perm_mask_high);
5938                 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5939                 (*result_chain)[2*j] = high;
5940
5941                 /* Create interleaving stmt:
5942                    low = VEC_PERM_EXPR <vect1, vect2,
5943                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5944                                          ...}>  */
5945                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5946                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5947                                                  vect2, perm_mask_low);
5948                 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
5949                 (*result_chain)[2*j+1] = low;
5950               }
                 /* Feed this stage's output back in as the next stage's input.
                    DR_CHAIN is passed by reference, so the caller's vector is
                    overwritten as well.  */
5951             memcpy (dr_chain.address (), result_chain->address (),
5952                     length * sizeof (tree));
5953           }
5954     }
5955 }
5956
5957 /* Function vect_setup_realignment
5958
5959    This function is called when vectorizing an unaligned load using
5960    the dr_explicit_realign[_optimized] scheme.
5961    This function generates the following code at the loop prolog:
5962
5963       p = initial_addr;
5964    x  msq_init = *(floor(p));   # prolog load
5965       realignment_token = call target_builtin;
5966     loop:
5967    x  msq = phi (msq_init, ---)
5968
5969    The stmts marked with x are generated only for the case of
5970    dr_explicit_realign_optimized.
5971
5972    The code above sets up a new (vector) pointer, pointing to the first
5973    location accessed by STMT_INFO, and a "floor-aligned" load using that
5974    pointer.  It also generates code to compute the "realignment-token"
5975    (if the relevant target hook was defined), and creates a phi-node at the
5976    loop-header bb whose arguments are the result of the prolog-load (created
5977    by this function) and the result of a load that takes place in the loop
5978    (to be created by the caller to this function).
5979
5980    For the case of dr_explicit_realign_optimized:
5981    The caller to this function uses the phi-result (msq) to create the
5982    realignment code inside the loop, and sets up the missing phi argument,
5983    as follows:
5984     loop:
5985       msq = phi (msq_init, lsq)
5986       lsq = *(floor(p'));        # load in loop
5987       result = realign_load (msq, lsq, realignment_token);
5988
5989    For the case of dr_explicit_realign:
5990     loop:
5991       msq = *(floor(p));        # load in loop
5992       p' = p + (VS-1);
5993       lsq = *(floor(p'));       # load in loop
5994       result = realign_load (msq, lsq, realignment_token);
5995
5996    Input:
5997    STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5998                a memory location that may be unaligned.
5999    GSI - place where new code is to be inserted.
6000    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
6001                               is used.
6002
6003    Output:
6004    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
6005                        target hook, if defined.
6006    Return value - the result of the loop-header phi node.  */
6007
6008 tree
6009 vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
6010                         gimple_stmt_iterator *gsi, tree *realignment_token,
6011                         enum dr_alignment_support alignment_support_scheme,
6012                         tree init_addr,
6013                         class loop **at_loop)
6014 {
6015   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6016   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);  /* Tested for NULL below.  */
6017   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
6018   struct data_reference *dr = dr_info->dr;
6019   class loop *loop = NULL;
6020   edge pe = NULL;
6021   tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
6022   tree vec_dest;
6023   gimple *inc;
6024   tree ptr;
6025   tree data_ref;
6026   basic_block new_bb;
6027   tree msq_init = NULL_TREE;
6028   tree new_temp;
6029   gphi *phi_stmt;
6030   tree msq = NULL_TREE;
6031   gimple_seq stmts = NULL;
6032   bool compute_in_loop = false;
6033   bool nested_in_vect_loop = false;
6034   class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
6035   class loop *loop_for_initial_load = NULL;
6036   /* Emit the set-up code for an explicitly realigned load.  LOOP stays NULL unless LOOP_VINFO is set.  */
6037   if (loop_vinfo)
6038     {
6039       loop = LOOP_VINFO_LOOP (loop_vinfo);
6040       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
6041     }
6042
6043   gcc_assert (alignment_support_scheme == dr_explicit_realign
6044               || alignment_support_scheme == dr_explicit_realign_optimized);
6045
6046   /* We need to generate three things:
6047      1. the misalignment computation
6048      2. the extra vector load (for the optimized realignment scheme).
6049      3. the phi node for the two vectors from which the realignment is
6050       done (for the optimized realignment scheme).  */
6051
6052   /* 1. Determine where to generate the misalignment computation.
6053
6054      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
6055      calculation will be generated by this function, outside the loop (in the
6056      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
6057      caller, inside the loop.
6058
6059      Background: If the misalignment remains fixed throughout the iterations of
6060      the loop, then both realignment schemes are applicable, and also the
6061      misalignment computation can be done outside LOOP.  This is because we are
6062      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
6063      are a multiple of VS (the Vector Size), and therefore the misalignment in
6064      different vectorized LOOP iterations is always the same.
6065      The problem arises only if the memory access is in an inner-loop nested
6066      inside LOOP, which is now being vectorized using outer-loop vectorization.
6067      This is the only case when the misalignment of the memory access may not
6068      remain fixed throughout the iterations of the inner-loop (as explained in
6069      detail in vect_supportable_dr_alignment).  In this case, not only is the
6070      optimized realignment scheme not applicable, but also the misalignment
6071      computation (and generation of the realignment token that is passed to
6072      REALIGN_LOAD) have to be done inside the loop.
6073
6074      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
6075      or not, which in turn determines if the misalignment is computed inside
6076      the inner-loop, or outside LOOP.  */
6077
6078   if (init_addr != NULL_TREE || !loop_vinfo)
6079     {
6080       compute_in_loop = true;
6081       gcc_assert (alignment_support_scheme == dr_explicit_realign);
6082     }
6083
6084
6085   /* 2. Determine where to generate the extra vector load.
6086
6087      For the optimized realignment scheme, instead of generating two vector
6088      loads in each iteration, we generate a single extra vector load in the
6089      preheader of the loop, and in each iteration reuse the result of the
6090      vector load from the previous iteration.  In case the memory access is in
6091      an inner-loop nested inside LOOP, which is now being vectorized using
6092      outer-loop vectorization, we need to determine whether this initial vector
6093      load should be generated at the preheader of the inner-loop, or can be
6094      generated at the preheader of LOOP.  If the memory access has no evolution
6095      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
6096      to be generated inside LOOP (in the preheader of the inner-loop).  */
6097
6098   if (nested_in_vect_loop)
6099     {
6100       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
6101       bool invariant_in_outerloop =
6102             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
6103       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
6104     }
6105   else
6106     loop_for_initial_load = loop;
6107   if (at_loop)
6108     *at_loop = loop_for_initial_load;
6109   /* VUSE for the prolog load: from the header's virtual PHI if any, else from the stmt at GSI.  */
6110   tree vuse = NULL_TREE;
6111   if (loop_for_initial_load)
6112     {
6113       pe = loop_preheader_edge (loop_for_initial_load);
6114       if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
6115         vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
6116     }
6117   if (!vuse)
6118     vuse = gimple_vuse (gsi_stmt (*gsi));
6119
6120   /* 3. For the case of the optimized realignment, create the first vector
6121       load at the loop preheader.  */
6122
6123   if (alignment_support_scheme == dr_explicit_realign_optimized)
6124     {
6125       /* Create msq_init = *(floor(p1)) in the loop preheader  */
6126       gassign *new_stmt;
6127
6128       gcc_assert (!compute_in_loop);
6129       vec_dest = vect_create_destination_var (scalar_dest, vectype);
6130       ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
6131                                       loop_for_initial_load, NULL_TREE,
6132                                       &init_addr, NULL, &inc, true);
6133       if (TREE_CODE (ptr) == SSA_NAME)
6134         new_temp = copy_ssa_name (ptr);
6135       else
6136         new_temp = make_ssa_name (TREE_TYPE (ptr));
6137       poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
6138       tree type = TREE_TYPE (ptr);
6139       new_stmt = gimple_build_assign
6140                    (new_temp, BIT_AND_EXPR, ptr,
6141                     fold_build2 (MINUS_EXPR, type,
6142                                  build_int_cst (type, 0),
6143                                  build_int_cst (type, align)));
6144       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6145       gcc_assert (!new_bb);
6146       data_ref
6147         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
6148                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
6149       vect_copy_ref_info (data_ref, DR_REF (dr));
6150       new_stmt = gimple_build_assign (vec_dest, data_ref);
6151       new_temp = make_ssa_name (vec_dest, new_stmt);
6152       gimple_assign_set_lhs (new_stmt, new_temp);
6153       gimple_set_vuse (new_stmt, vuse);
6154       if (pe)  /* NOTE(review): PE was already used unconditionally above, so the else arm looks unreachable - confirm.  */
6155         {
6156           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6157           gcc_assert (!new_bb);
6158         }
6159       else
6160          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6161
6162       msq_init = gimple_assign_lhs (new_stmt);
6163     }
6164
6165   /* 4. Create realignment token using a target builtin, if available.
6166       It is done either inside the containing loop, or before LOOP (as
6167       determined above).  */
6168
6169   if (targetm.vectorize.builtin_mask_for_load)
6170     {
6171       gcall *new_stmt;
6172       tree builtin_decl;
6173
6174       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
6175       if (!init_addr)
6176         {
6177           /* Generate the INIT_ADDR computation outside LOOP.  */
6178           init_addr = vect_create_addr_base_for_vector_ref (vinfo,
6179                                                             stmt_info, &stmts,
6180                                                             NULL_TREE);
6181           if (loop)
6182             {
6183               pe = loop_preheader_edge (loop);
6184               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6185               gcc_assert (!new_bb);
6186             }
6187           else
6188              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
6189         }
6190
6191       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
6192       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
6193       vec_dest =
6194         vect_create_destination_var (scalar_dest,
6195                                      gimple_call_return_type (new_stmt));
6196       new_temp = make_ssa_name (vec_dest, new_stmt);
6197       gimple_call_set_lhs (new_stmt, new_temp);
6198
6199       if (compute_in_loop)
6200         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6201       else
6202         {
6203           /* Generate the misalignment computation outside LOOP.  */
6204           pe = loop_preheader_edge (loop);  /* !compute_in_loop implies LOOP_VINFO, so LOOP is set.  */
6205           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6206           gcc_assert (!new_bb);
6207         }
6208
6209       *realignment_token = gimple_call_lhs (new_stmt);
6210
6211       /* The result of the CALL_EXPR to this builtin is determined from
6212          the value of the parameter and no global variables are touched
6213          which makes the builtin a "const" function.  Requiring the
6214          builtin to have the "const" attribute makes it unnecessary
6215          to call mark_call_clobbered.  */
6216       gcc_assert (TREE_READONLY (builtin_decl));
6217     }
6218
6219   if (alignment_support_scheme == dr_explicit_realign)
6220     return msq;  /* Still NULL_TREE: no phi is created for this scheme.  */
6221
6222   gcc_assert (!compute_in_loop);
6223   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
6224
6225
6226   /* 5. Create msq = phi <msq_init, lsq> in loop.  Only the preheader
6227      argument (msq_init) is added here.  */
6228   pe = loop_preheader_edge (containing_loop);
6229   vec_dest = vect_create_destination_var (scalar_dest, vectype);
6230   msq = make_ssa_name (vec_dest);
6231   phi_stmt = create_phi_node (msq, containing_loop->header);
6232   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
6233
6234   return msq;
6235 }
6236
6237
6238 /* Function vect_grouped_load_supported.
6239
6240    COUNT is the size of the load group (the number of statements plus the
6241    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
6242    only one statement, with a gap of COUNT - 1.
6243
6244    Returns true if a suitable permute exists.  */
6245
6246 bool
6247 vect_grouped_load_supported (tree vectype, bool single_element_p,
6248                              unsigned HOST_WIDE_INT count)
6249 {
6250   machine_mode mode = TYPE_MODE (vectype);
6251
6252   /* If this is single-element interleaving with an element distance
6253      that leaves unused vector loads around punt - we at least create
6254      very sub-optimal code in that case (and blow up memory,
6255      see PR65518).  */
6256   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
6257     {
6258       if (dump_enabled_p ())
6259         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6260                          "single-element interleaving not supported "
6261                          "for not adjacent vector loads\n");
6262       return false;
6263     }
6264
6265   /* vect_permute_load_chain requires the group size to be equal to 3 or
6266      be a power of two.  */
6267   if (count != 3 && exact_log2 (count) == -1)
6268     {
6269       if (dump_enabled_p ())
6270         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6271                          "the size of the group of accesses"
6272                          " is not a power of 2 or not equal to 3\n");
6273       return false;
6274     }
6275
6276   /* Check that the permutation is supported.  */
6277   if (VECTOR_MODE_P (mode))
6278     {
6279       unsigned int i, j;
6280       if (count == 3)
6281         {
6282           unsigned int nelt;
6283           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
6284             {
6285               if (dump_enabled_p ())
6286                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6287                                  "cannot handle groups of 3 loads for"
6288                                  " variable-length vectors\n");
6289               return false;
6290             }
6291           /* For each K check the two masks vect_permute_load_chain builds for a group of 3.  */
6292           vec_perm_builder sel (nelt, nelt, 1);
6293           sel.quick_grow (nelt);
6294           vec_perm_indices indices;
6295           unsigned int k;
6296           for (k = 0; k < 3; k++)
6297             {
6298               for (i = 0; i < nelt; i++)
6299                 if (3 * i + k < 2 * nelt)
6300                   sel[i] = 3 * i + k;
6301                 else
6302                   sel[i] = 0;
6303               indices.new_vector (sel, 2, nelt);
6304               if (!can_vec_perm_const_p (mode, mode, indices))
6305                 {
6306                   if (dump_enabled_p ())
6307                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6308                                      "shuffle of 3 loads is not supported by"
6309                                      " target\n");
6310                   return false;
6311                 }
6312               for (i = 0, j = 0; i < nelt; i++)
6313                 if (3 * i + k < 2 * nelt)
6314                   sel[i] = i;
6315                 else
6316                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6317               indices.new_vector (sel, 2, nelt);
6318               if (!can_vec_perm_const_p (mode, mode, indices))
6319                 {
6320                   if (dump_enabled_p ())
6321                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6322                                      "shuffle of 3 loads is not supported by"
6323                                      " target\n");
6324                   return false;
6325                 }
6326             }
6327           return true;
6328         }
6329       else
6330         {
6331           /* If length is not equal to 3 then only power of 2 is supported.  */
6332           gcc_assert (pow2p_hwi (count));
6333           poly_uint64 nelt = GET_MODE_NUNITS (mode);
6334
6335           /* The encoding has a single stepped pattern.  */
6336           vec_perm_builder sel (nelt, 1, 3);
6337           sel.quick_grow (3);
6338           for (i = 0; i < 3; i++)
6339             sel[i] = i * 2;  /* {0, 2, 4, ...}: the extract-even mask.  */
6340           vec_perm_indices indices (sel, 2, nelt);
6341           if (can_vec_perm_const_p (mode, mode, indices))
6342             {
6343               for (i = 0; i < 3; i++)
6344                 sel[i] = i * 2 + 1;  /* {1, 3, 5, ...}: the extract-odd mask.  */
6345               indices.new_vector (sel, 2, nelt);
6346               if (can_vec_perm_const_p (mode, mode, indices))
6347                 return true;
6348             }
6349         }
6350     }
6351   /* Reached when MODE is not a vector mode or an even/odd permute is unsupported.  */
6352   if (dump_enabled_p ())
6353     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6354                      "extract even/odd not supported by target\n");
6355   return false;
6356 }
6357
6358 /* Return FN if vec_{,mask_,mask_len_}load_lanes is available for COUNT vectors
6359    of type VECTYPE.  MASKED_P says whether the masked form is needed.  */
6360
6361 internal_fn
6362 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6363                            bool masked_p)
6364 {
6365   if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
6366                                     vec_mask_len_load_lanes_optab, vectype,
6367                                     count))
6368     return IFN_MASK_LEN_LOAD_LANES;  /* Tried first, irrespective of MASKED_P.  */
6369   else if (masked_p)
6370     {
6371       if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6372                                         vec_mask_load_lanes_optab, vectype,
6373                                         count))
6374         return IFN_MASK_LOAD_LANES;
6375     }
6376   else
6377     {
6378       if (vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
6379                                         vectype, count))
6380         return IFN_LOAD_LANES;
6381     }
6382   return IFN_LAST;  /* None of the optabs checked above is supported.  */
6383 }
6384
6385 /* Function vect_permute_load_chain.
6386
6387    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6388    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6389    the input data correctly.  Return the final references for loads in
6390    RESULT_CHAIN.
6391
6392    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6393    The input is 4 vectors each containing 8 elements. We assign a number to each
6394    element, the input sequence is:
6395
6396    1st vec:   0  1  2  3  4  5  6  7
6397    2nd vec:   8  9 10 11 12 13 14 15
6398    3rd vec:  16 17 18 19 20 21 22 23
6399    4th vec:  24 25 26 27 28 29 30 31
6400
6401    The output sequence should be:
6402
6403    1st vec:  0 4  8 12 16 20 24 28
6404    2nd vec:  1 5  9 13 17 21 25 29
6405    3rd vec:  2 6 10 14 18 22 26 30
6406    4th vec:  3 7 11 15 19 23 27 31
6407
6408    i.e., the first output vector should contain the first elements of each
6409    interleaving group, etc.
6410
6411    We use extract_even/odd instructions to create such output.  The input of
6412    each extract_even/odd operation is two vectors
6413    1st vec    2nd vec
6414    0 1 2 3    4 5 6 7
6415
6416    and the output is the vector of extracted even/odd elements.  The output of
6417    extract_even will be:   0 2 4 6
6418    and of extract_odd:     1 3 5 7
6419
6420
6421    The permutation is done in log LENGTH stages.  In each stage extract_even
6422    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6423    their order.  In our example,
6424
6425    E1: extract_even (1st vec, 2nd vec)
6426    E2: extract_odd (1st vec, 2nd vec)
6427    E3: extract_even (3rd vec, 4th vec)
6428    E4: extract_odd (3rd vec, 4th vec)
6429
6430    The output for the first stage will be:
6431
6432    E1:  0  2  4  6  8 10 12 14
6433    E2:  1  3  5  7  9 11 13 15
6434    E3: 16 18 20 22 24 26 28 30
6435    E4: 17 19 21 23 25 27 29 31
6436
6437    In order to proceed and create the correct sequence for the next stage (or
6438    for the correct output, if the second stage is the last one, as in our
6439    example), we first put the output of extract_even operation and then the
6440    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6441    The input for the second stage is:
6442
6443    1st vec (E1):  0  2  4  6  8 10 12 14
6444    2nd vec (E3): 16 18 20 22 24 26 28 30
6445    3rd vec (E2):  1  3  5  7  9 11 13 15
6446    4th vec (E4): 17 19 21 23 25 27 29 31
6447
6448    The output of the second stage:
6449
6450    E1: 0 4  8 12 16 20 24 28
6451    E2: 2 6 10 14 18 22 26 30
6452    E3: 1 5  9 13 17 21 25 29
6453    E4: 3 7 11 15 19 23 27 31
6454
6455    And RESULT_CHAIN after reordering:
6456
6457    1st vec (E1):  0 4  8 12 16 20 24 28
6458    2nd vec (E3):  1 5  9 13 17 21 25 29
6459    3rd vec (E2):  2 6 10 14 18 22 26 30
6460    4th vec (E4):  3 7 11 15 19 23 27 31.  */
6461
6462 static void
6463 vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6464                          unsigned int length,
6465                          stmt_vec_info stmt_info,
6466                          gimple_stmt_iterator *gsi,
6467                          vec<tree> *result_chain)
6468 {
6469   tree data_ref, first_vect, second_vect;
6470   tree perm_mask_even, perm_mask_odd;
6471   tree perm3_mask_low, perm3_mask_high;
6472   gimple *perm_stmt;
6473   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6474   unsigned int i, j, log_length = exact_log2 (length);  /* LOG_LENGTH is only used on the power-of-2 path.  */
6475   /* Start RESULT_CHAIN as a copy of DR_CHAIN.  */
6476   result_chain->quick_grow (length);
6477   memcpy (result_chain->address (), dr_chain.address (),
6478           length * sizeof (tree));
6479
6480   if (length == 3)
6481     {
6482       /* vect_grouped_load_supported ensures that this is constant.  */
6483       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6484       unsigned int k;
6485
6486       vec_perm_builder sel (nelt, nelt, 1);
6487       sel.quick_grow (nelt);
6488       vec_perm_indices indices;
6489       for (k = 0; k < 3; k++)
6490         {
6491           for (i = 0; i < nelt; i++)
6492             if (3 * i + k < 2 * nelt)
6493               sel[i] = 3 * i + k;
6494             else
6495               sel[i] = 0;
6496           indices.new_vector (sel, 2, nelt);
6497           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6498
6499           for (i = 0, j = 0; i < nelt; i++)
6500             if (3 * i + k < 2 * nelt)
6501               sel[i] = i;
6502             else
6503               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6504           indices.new_vector (sel, 2, nelt);
6505           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6506
6507           first_vect = dr_chain[0];
6508           second_vect = dr_chain[1];
6509
6510           /* Create interleaving stmt (low part of):
6511              low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
6512                                                             ...}>  */
6513           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6514           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6515                                            second_vect, perm3_mask_low);
6516           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6517
6518           /* Create interleaving stmt (high part): keep the lanes already
6519              filled in LOW and take the remaining lanes from dr_chain[2]:
6520              high = VEC_PERM_EXPR <low, dr_chain[2], perm3_mask_high>  */
6521           first_vect = data_ref;
6522           second_vect = dr_chain[2];
6523           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6524           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6525                                            second_vect, perm3_mask_high);
6526           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6527           (*result_chain)[k] = data_ref;
6528         }
6529     }
6530   else
6531     {
6532       /* If length is not equal to 3 then only power of 2 is supported.  */
6533       gcc_assert (pow2p_hwi (length));
6534
6535       /* The encoding has a single stepped pattern.  */
6536       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6537       vec_perm_builder sel (nelt, 1, 3);
6538       sel.quick_grow (3);
6539       for (i = 0; i < 3; ++i)
6540         sel[i] = i * 2;
6541       vec_perm_indices indices (sel, 2, nelt);
6542       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6543
6544       for (i = 0; i < 3; ++i)
6545         sel[i] = i * 2 + 1;
6546       indices.new_vector (sel, 2, nelt);
6547       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6548
6549       for (i = 0; i < log_length; i++)
6550         {
6551           for (j = 0; j < length; j += 2)
6552             {
6553               first_vect = dr_chain[j];
6554               second_vect = dr_chain[j+1];
6555
6556               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
6557               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6558               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6559                                                first_vect, second_vect,
6560                                                perm_mask_even);
6561               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6562               (*result_chain)[j/2] = data_ref;
6563
6564               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
6565               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6566               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6567                                                first_vect, second_vect,
6568                                                perm_mask_odd);
6569               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6570               (*result_chain)[j/2+length/2] = data_ref;
6571             }
6572           memcpy (dr_chain.address (), result_chain->address (),
6573                   length * sizeof (tree));  /* This stage's output is the next stage's input.  */
6574         }
6575     }
6576 }
6577
6578 /* Function vect_shift_permute_load_chain.
6579
6580    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6581    sequence of stmts to reorder the input data accordingly.
6582    Return the final references for loads in RESULT_CHAIN.
6583    Return true if successful, false otherwise.
6584
6585    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6586    The input is 3 vectors each containing 8 elements.  We assign a
6587    number to each element, the input sequence is:
6588
6589    1st vec:   0  1  2  3  4  5  6  7
6590    2nd vec:   8  9 10 11 12 13 14 15
6591    3rd vec:  16 17 18 19 20 21 22 23
6592
6593    The output sequence should be:
6594
6595    1st vec:  0 3 6  9 12 15 18 21
6596    2nd vec:  1 4 7 10 13 16 19 22
6597    3rd vec:  2 5 8 11 14 17 20 23
6598
6599    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6600
6601    First we shuffle all 3 vectors to get correct elements order:
6602
6603    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
6604    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
6605    3rd vec:  (16 19 22) (17 20 23) (18 21)
6606
6607    Next we unite and shift vector 3 times:
6608
6609    1st step:
6610      shift right by 6 the concatenation of:
6611      "1st vec" and  "2nd vec"
6612        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6613      "2nd vec" and  "3rd vec"
6614        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6615      "3rd vec" and  "1st vec"
6616        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6617                              | New vectors                   |
6618
6619      So that now new vectors are:
6620
6621      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6622      2nd vec:  (10 13) (16 19 22) (17 20 23)
6623      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6624
6625    2nd step:
6626      shift right by 5 the concatenation of:
6627      "1st vec" and  "3rd vec"
6628        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6629      "2nd vec" and  "1st vec"
6630        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6631      "3rd vec" and  "2nd vec"
6632        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6633                           | New vectors                   |
6634
6635      So that now new vectors are:
6636
6637      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6638      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6639      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6640
6641    3rd step:
6642      shift right by 5 the concatenation of:
6643      "1st vec" and  "1st vec"
6644        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6645      shift right by 3 the concatenation of:
6646      "2nd vec" and  "2nd vec"
6647                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6648                           | New vectors                   |
6649
6650      So that now all vectors are READY:
6651      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6652      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6653      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6654
6655    This algorithm is faster than one in vect_permute_load_chain if:
6656      1.  "shift of a concatenation" is faster than general permutation.
6657          This is usually so.
6658      2.  The TARGET machine can't execute vector instructions in parallel.
6659          This is because each step of the algorithm depends on previous.
6660          The algorithm in vect_permute_load_chain is much more parallel.
6661
6662    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6663 */
6664
6665 static bool
6666 vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6667                                unsigned int length,
6668                                stmt_vec_info stmt_info,
6669                                gimple_stmt_iterator *gsi,
6670                                vec<tree> *result_chain)
6671 {
6672   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6673   tree perm2_mask1, perm2_mask2, perm3_mask;
6674   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6675   gimple *perm_stmt;
6676
6677   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6678   machine_mode vmode = TYPE_MODE (vectype);
6679   unsigned int i;
6680   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6681
6682   unsigned HOST_WIDE_INT nelt, vf;
6683   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6684       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6685     /* Not supported for variable-length vectors.  */
6686     return false;
6687
6688   vec_perm_builder sel (nelt, nelt, 1);
6689   sel.quick_grow (nelt);
6690
6691   result_chain->quick_grow (length);
6692   memcpy (result_chain->address (), dr_chain.address (),
6693           length * sizeof (tree));
6694
6695   if (pow2p_hwi (length) && vf > 4)
6696     {
6697       unsigned int j, log_length = exact_log2 (length);
6698       for (i = 0; i < nelt / 2; ++i)
6699         sel[i] = i * 2;
6700       for (i = 0; i < nelt / 2; ++i)
6701         sel[nelt / 2 + i] = i * 2 + 1;
6702       vec_perm_indices indices (sel, 2, nelt);
6703       if (!can_vec_perm_const_p (vmode, vmode, indices))
6704         {
6705           if (dump_enabled_p ())
6706             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6707                              "shuffle of 2 fields structure is not \
6708                               supported by target\n");
6709           return false;
6710         }
6711       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6712
6713       for (i = 0; i < nelt / 2; ++i)
6714         sel[i] = i * 2 + 1;
6715       for (i = 0; i < nelt / 2; ++i)
6716         sel[nelt / 2 + i] = i * 2;
6717       indices.new_vector (sel, 2, nelt);
6718       if (!can_vec_perm_const_p (vmode, vmode, indices))
6719         {
6720           if (dump_enabled_p ())
6721             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6722                              "shuffle of 2 fields structure is not \
6723                               supported by target\n");
6724           return false;
6725         }
6726       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6727
6728       /* Generating permutation constant to shift all elements.
6729          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6730       for (i = 0; i < nelt; i++)
6731         sel[i] = nelt / 2 + i;
6732       indices.new_vector (sel, 2, nelt);
6733       if (!can_vec_perm_const_p (vmode, vmode, indices))
6734         {
6735           if (dump_enabled_p ())
6736             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6737                              "shift permutation is not supported by target\n");
6738           return false;
6739         }
6740       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6741
6742       /* Generating permutation constant to select vector from 2.
6743          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6744       for (i = 0; i < nelt / 2; i++)
6745         sel[i] = i;
6746       for (i = nelt / 2; i < nelt; i++)
6747         sel[i] = nelt + i;
6748       indices.new_vector (sel, 2, nelt);
6749       if (!can_vec_perm_const_p (vmode, vmode, indices))
6750         {
6751           if (dump_enabled_p ())
6752             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6753                              "select is not supported by target\n");
6754           return false;
6755         }
6756       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6757
6758       for (i = 0; i < log_length; i++)
6759         {
6760           for (j = 0; j < length; j += 2)
6761             {
6762               first_vect = dr_chain[j];
6763               second_vect = dr_chain[j + 1];
6764
6765               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6766               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6767                                                first_vect, first_vect,
6768                                                perm2_mask1);
6769               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6770               vect[0] = data_ref;
6771
6772               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6773               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6774                                                second_vect, second_vect,
6775                                                perm2_mask2);
6776               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6777               vect[1] = data_ref;
6778
6779               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6780               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6781                                                vect[0], vect[1], shift1_mask);
6782               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6783               (*result_chain)[j/2 + length/2] = data_ref;
6784
6785               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6786               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6787                                                vect[0], vect[1], select_mask);
6788               vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6789               (*result_chain)[j/2] = data_ref;
6790             }
6791           memcpy (dr_chain.address (), result_chain->address (),
6792                   length * sizeof (tree));
6793         }
6794       return true;
6795     }
6796   if (length == 3 && vf > 2)
6797     {
6798       unsigned int k = 0, l = 0;
6799
6800       /* Generating permutation constant to get all elements in rigth order.
6801          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6802       for (i = 0; i < nelt; i++)
6803         {
6804           if (3 * k + (l % 3) >= nelt)
6805             {
6806               k = 0;
6807               l += (3 - (nelt % 3));
6808             }
6809           sel[i] = 3 * k + (l % 3);
6810           k++;
6811         }
6812       vec_perm_indices indices (sel, 2, nelt);
6813       if (!can_vec_perm_const_p (vmode, vmode, indices))
6814         {
6815           if (dump_enabled_p ())
6816             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6817                              "shuffle of 3 fields structure is not \
6818                               supported by target\n");
6819           return false;
6820         }
6821       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6822
6823       /* Generating permutation constant to shift all elements.
6824          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6825       for (i = 0; i < nelt; i++)
6826         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6827       indices.new_vector (sel, 2, nelt);
6828       if (!can_vec_perm_const_p (vmode, vmode, indices))
6829         {
6830           if (dump_enabled_p ())
6831             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6832                              "shift permutation is not supported by target\n");
6833           return false;
6834         }
6835       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6836
6837       /* Generating permutation constant to shift all elements.
6838          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6839       for (i = 0; i < nelt; i++)
6840         sel[i] = 2 * (nelt / 3) + 1 + i;
6841       indices.new_vector (sel, 2, nelt);
6842       if (!can_vec_perm_const_p (vmode, vmode, indices))
6843         {
6844           if (dump_enabled_p ())
6845             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6846                              "shift permutation is not supported by target\n");
6847           return false;
6848         }
6849       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6850
6851       /* Generating permutation constant to shift all elements.
6852          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6853       for (i = 0; i < nelt; i++)
6854         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6855       indices.new_vector (sel, 2, nelt);
6856       if (!can_vec_perm_const_p (vmode, vmode, indices))
6857         {
6858           if (dump_enabled_p ())
6859             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6860                              "shift permutation is not supported by target\n");
6861           return false;
6862         }
6863       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6864
6865       /* Generating permutation constant to shift all elements.
6866          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6867       for (i = 0; i < nelt; i++)
6868         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6869       indices.new_vector (sel, 2, nelt);
6870       if (!can_vec_perm_const_p (vmode, vmode, indices))
6871         {
6872           if (dump_enabled_p ())
6873             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6874                              "shift permutation is not supported by target\n");
6875           return false;
6876         }
6877       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6878
6879       for (k = 0; k < 3; k++)
6880         {
6881           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6882           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6883                                            dr_chain[k], dr_chain[k],
6884                                            perm3_mask);
6885           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6886           vect[k] = data_ref;
6887         }
6888
6889       for (k = 0; k < 3; k++)
6890         {
6891           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6892           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6893                                            vect[k % 3], vect[(k + 1) % 3],
6894                                            shift1_mask);
6895           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6896           vect_shift[k] = data_ref;
6897         }
6898
6899       for (k = 0; k < 3; k++)
6900         {
6901           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6902           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6903                                            vect_shift[(4 - k) % 3],
6904                                            vect_shift[(3 - k) % 3],
6905                                            shift2_mask);
6906           vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6907           vect[k] = data_ref;
6908         }
6909
6910       (*result_chain)[3 - (nelt % 3)] = vect[2];
6911
6912       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6913       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6914                                        vect[0], shift3_mask);
6915       vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6916       (*result_chain)[nelt % 3] = data_ref;
6917
6918       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6919       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6920                                        vect[1], shift4_mask);
6921       vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6922       (*result_chain)[0] = data_ref;
6923       return true;
6924     }
6925   return false;
6926 }
6927
6928 /* Function vect_transform_grouped_load.
6929
6930    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6931    to perform their permutation and ascribe the result vectorized statements to
6932    the scalar statements.
6933 */
6934
6935 void
6936 vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
6937                              vec<tree> dr_chain,
6938                              int size, gimple_stmt_iterator *gsi)
6939 {
6940   machine_mode mode;
6941   vec<tree> result_chain = vNULL;
6942
6943   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6944      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6945      vectors, that are ready for vector computation.  */
6946   result_chain.create (size);
6947
6948   /* If reassociation width for vector type is 2 or greater target machine can
6949      execute 2 or more vector instructions in parallel.  Otherwise try to
6950      get chain for loads group using vect_shift_permute_load_chain.  */
6951   mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6952   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6953       || pow2p_hwi (size)
6954       || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
6955                                          gsi, &result_chain))
6956     vect_permute_load_chain (vinfo, dr_chain,
6957                              size, stmt_info, gsi, &result_chain);
6958   vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
6959   result_chain.release ();
6960 }
6961
6962 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6963    generated as part of the vectorization of STMT_INFO.  Assign the statement
6964    for each vector to the associated scalar statement.  */
6965
6966 void
6967 vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
6968                                   vec<tree> result_chain)
6969 {
6970   stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6971   unsigned int i, gap_count;
6972   tree tmp_data_ref;
6973
6974   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6975      Since we scan the chain starting from it's first node, their order
6976      corresponds the order of data-refs in RESULT_CHAIN.  */
6977   stmt_vec_info next_stmt_info = first_stmt_info;
6978   gap_count = 1;
6979   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6980     {
6981       if (!next_stmt_info)
6982         break;
6983
6984       /* Skip the gaps.  Loads created for the gaps will be removed by dead
6985        code elimination pass later.  No need to check for the first stmt in
6986        the group, since it always exists.
6987        DR_GROUP_GAP is the number of steps in elements from the previous
6988        access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
6989        correspond to the gaps.  */
6990       if (next_stmt_info != first_stmt_info
6991           && gap_count < DR_GROUP_GAP (next_stmt_info))
6992         {
6993           gap_count++;
6994           continue;
6995         }
6996
6997       /* ???  The following needs cleanup after the removal of
6998          DR_GROUP_SAME_DR_STMT.  */
6999       if (next_stmt_info)
7000         {
7001           gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
7002           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
7003              copies, and we put the new vector statement last.  */
7004           STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
7005
7006           next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
7007           gap_count = 1;
7008         }
7009     }
7010 }
7011
7012 /* Function vect_force_dr_alignment_p.
7013
7014    Returns whether the alignment of a DECL can be forced to be aligned
7015    on ALIGNMENT bit boundary.  */
7016
7017 bool
7018 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
7019 {
7020   if (!VAR_P (decl))
7021     return false;
7022
7023   if (decl_in_symtab_p (decl)
7024       && !symtab_node::get (decl)->can_increase_alignment_p ())
7025     return false;
7026
7027   if (TREE_STATIC (decl))
7028     return (known_le (alignment,
7029                       (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
7030   else
7031     return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
7032 }
7033
7034 /* Return whether the data reference DR_INFO is supported with respect to its
7035    alignment.
7036    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
7037    it is aligned, i.e., check if it is possible to vectorize it with different
7038    alignment.  */
7039
7040 enum dr_alignment_support
7041 vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
7042                                tree vectype, int misalignment)
7043 {
7044   data_reference *dr = dr_info->dr;
7045   stmt_vec_info stmt_info = dr_info->stmt;
7046   machine_mode mode = TYPE_MODE (vectype);
7047   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7048   class loop *vect_loop = NULL;
7049   bool nested_in_vect_loop = false;
7050
7051   if (misalignment == 0)
7052     return dr_aligned;
7053
7054   /* For now assume all conditional loads/stores support unaligned
7055      access without any special code.  */
7056   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
7057     if (gimple_call_internal_p (stmt)
7058         && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
7059             || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
7060       return dr_unaligned_supported;
7061
7062   if (loop_vinfo)
7063     {
7064       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
7065       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
7066     }
7067
7068   /* Possibly unaligned access.  */
7069
7070   /* We can choose between using the implicit realignment scheme (generating
7071      a misaligned_move stmt) and the explicit realignment scheme (generating
7072      aligned loads with a REALIGN_LOAD).  There are two variants to the
7073      explicit realignment scheme: optimized, and unoptimized.
7074      We can optimize the realignment only if the step between consecutive
7075      vector loads is equal to the vector size.  Since the vector memory
7076      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
7077      is guaranteed that the misalignment amount remains the same throughout the
7078      execution of the vectorized loop.  Therefore, we can create the
7079      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
7080      at the loop preheader.
7081
7082      However, in the case of outer-loop vectorization, when vectorizing a
7083      memory access in the inner-loop nested within the LOOP that is now being
7084      vectorized, while it is guaranteed that the misalignment of the
7085      vectorized memory access will remain the same in different outer-loop
7086      iterations, it is *not* guaranteed that is will remain the same throughout
7087      the execution of the inner-loop.  This is because the inner-loop advances
7088      with the original scalar step (and not in steps of VS).  If the inner-loop
7089      step happens to be a multiple of VS, then the misalignment remains fixed
7090      and we can use the optimized realignment scheme.  For example:
7091
7092       for (i=0; i<N; i++)
7093         for (j=0; j<M; j++)
7094           s += a[i+j];
7095
7096      When vectorizing the i-loop in the above example, the step between
7097      consecutive vector loads is 1, and so the misalignment does not remain
7098      fixed across the execution of the inner-loop, and the realignment cannot
7099      be optimized (as illustrated in the following pseudo vectorized loop):
7100
7101       for (i=0; i<N; i+=4)
7102         for (j=0; j<M; j++){
7103           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
7104                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
7105                          // (assuming that we start from an aligned address).
7106           }
7107
7108      We therefore have to use the unoptimized realignment scheme:
7109
7110       for (i=0; i<N; i+=4)
7111           for (j=k; j<M; j+=4)
7112           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
7113                            // that the misalignment of the initial address is
7114                            // 0).
7115
7116      The loop can then be vectorized as follows:
7117
7118       for (k=0; k<4; k++){
7119         rt = get_realignment_token (&vp[k]);
7120         for (i=0; i<N; i+=4){
7121           v1 = vp[i+k];
7122           for (j=k; j<M; j+=4){
7123             v2 = vp[i+j+VS-1];
7124             va = REALIGN_LOAD <v1,v2,rt>;
7125             vs += va;
7126             v1 = v2;
7127           }
7128         }
7129     } */
7130
7131   if (DR_IS_READ (dr))
7132     {
7133       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
7134           && (!targetm.vectorize.builtin_mask_for_load
7135               || targetm.vectorize.builtin_mask_for_load ()))
7136         {
7137           /* If we are doing SLP then the accesses need not have the
7138              same alignment, instead it depends on the SLP group size.  */
7139           if (loop_vinfo
7140               && STMT_SLP_TYPE (stmt_info)
7141               && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
7142                   || !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7143                                   * (DR_GROUP_SIZE
7144                                        (DR_GROUP_FIRST_ELEMENT (stmt_info))),
7145                                   TYPE_VECTOR_SUBPARTS (vectype))))
7146             ;
7147           else if (!loop_vinfo
7148                    || (nested_in_vect_loop
7149                        && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
7150                                     GET_MODE_SIZE (TYPE_MODE (vectype)))))
7151             return dr_explicit_realign;
7152           else
7153             return dr_explicit_realign_optimized;
7154         }
7155     }
7156
7157   bool is_packed = false;
7158   tree type = TREE_TYPE (DR_REF (dr));
7159   if (misalignment == DR_MISALIGNMENT_UNKNOWN)
7160     is_packed = not_size_aligned (DR_REF (dr));
7161   if (targetm.vectorize.support_vector_misalignment (mode, type, misalignment,
7162                                                      is_packed))
7163     return dr_unaligned_supported;
7164
7165   /* Unsupported.  */
7166   return dr_unaligned_unsupported;
7167 }