gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2019 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "params.h"
  53 #include "tree-cfg.h"
  54 #include "tree-hash-traits.h"
  55 #include "vec-perm-indices.h"
  56 #include "internal-fn.h"
  57
  58 /* Return true if load- or store-lanes optab OPTAB is implemented for
  59    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  60
  61 static bool
  62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  63                               tree vectype, unsigned HOST_WIDE_INT count)
  64 {
  65   machine_mode mode, array_mode;
  66   bool limit_p;
  67
  68   mode = TYPE_MODE (vectype);
  69   if (!targetm.array_mode (mode, count).exists (&array_mode))
  70     {
  71       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
  72       limit_p = !targetm.array_mode_supported_p (mode, count);
  73       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
  74         {
  75           if (dump_enabled_p ())
  76             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  77                              "no array mode for %s[%wu]\n",
  78                              GET_MODE_NAME (mode), count);
  79           return false;
  80         }
  81     }
  82
  83   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  84     {
  85       if (dump_enabled_p ())
  86         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  87                          "cannot use %s<%s><%s>\n", name,
  88                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  89       return false;
  90     }
  91
  92   if (dump_enabled_p ())
  93     dump_printf_loc (MSG_NOTE, vect_location,
  94                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  95                      GET_MODE_NAME (mode));
  96
  97   return true;
  98 }
  99
 100
 101 /* Return the smallest scalar part of STMT_INFO.
 102    This is used to determine the vectype of the stmt.  We generally set the
 103    vectype according to the type of the result (lhs).  For stmts whose
 104    result-type is different than the type of the arguments (e.g., demotion,
 105    promotion), vectype will be reset appropriately (later).  Note that we have
 106    to visit the smallest datatype in this function, because that determines the
 107    VF.  If the smallest datatype in the loop is present only as the rhs of a
 108    promotion operation - we'd miss it.
 109    Such a case, where a variable of this datatype does not appear in the lhs
 110    anywhere in the loop, can only occur if it's an invariant: e.g.:
 111    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 112    invariant motion.  However, we cannot rely on invariant motion to always
 113    take invariants out of the loop, and so in the case of promotion we also
 114    have to check the rhs.
 115    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 116    types.  */
 117
 118 tree
 119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
 120                                HOST_WIDE_INT *lhs_size_unit,
 121                                HOST_WIDE_INT *rhs_size_unit)
 122 {
 123   tree scalar_type = gimple_expr_type (stmt_info->stmt);
 124   HOST_WIDE_INT lhs, rhs;
 125
 126   /* During the analysis phase, this function is called on arbitrary
 127      statements that might not have scalar results.  */
 128   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
 129     return scalar_type;
 130
 131   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 132
 133   gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
 134   if (assign
 135       && (gimple_assign_cast_p (assign)
 136           || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
 137           || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
 138           || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
 139           || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
 140           || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
 141     {
 142       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
 143
 144       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 145       if (rhs < lhs)
 146         scalar_type = rhs_type;
 147     }
 148   else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
 149     {
 150       unsigned int i = 0;
 151       if (gimple_call_internal_p (call))
 152         {
 153           internal_fn ifn = gimple_call_internal_fn (call);
 154           if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
 155             /* gimple_expr_type already picked the type of the loaded
 156                or stored data.  */
 157             i = ~0U;
 158           else if (internal_fn_mask_index (ifn) == 0)
 159             i = 1;
 160         }
 161       if (i < gimple_call_num_args (call))
 162         {
 163           tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
 164           if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
 165             {
 166               rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 167               if (rhs < lhs)
 168                 scalar_type = rhs_type;
 169             }
 170         }
 171     }
 172
 173   *lhs_size_unit = lhs;
 174   *rhs_size_unit = rhs;
 175   return scalar_type;
 176 }
 177
 178
 179 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 180    tested at run-time.  Return TRUE if DDR was successfully inserted.
 181    Return false if versioning is not supported.  */
 182
 183 static opt_result
 184 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 185 {
 186   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 187
 188   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 189     return opt_result::failure_at (vect_location,
 190                                    "will not create alias checks, as"
 191                                    " --param vect-max-version-for-alias-checks"
 192                                    " == 0\n");
 193
 194   opt_result res
 195     = runtime_alias_check_p (ddr, loop,
 196                              optimize_loop_nest_for_speed_p (loop));
 197   if (!res)
 198     return res;
 199
 200   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 201   return opt_result::success ();
 202 }
 203
 204 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
 205
 206 static void
 207 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
 208 {
 209   vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
 210   for (unsigned int i = 0; i < checks.length(); ++i)
 211     if (checks[i] == value)
 212       return;
 213
 214   if (dump_enabled_p ())
 215     dump_printf_loc (MSG_NOTE, vect_location,
 216                      "need run-time check that %T is nonzero\n",
 217                      value);
 218   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
 219 }
 220
 221 /* Return true if we know that the order of vectorized DR_INFO_A and
 222    vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
 223    DR_INFO_B.  At least one of the accesses is a write.  */
 224
 225 static bool
 226 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
 227 {
 228   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
 229   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
 230
 231   /* Single statements are always kept in their original order.  */
 232   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 233       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 234     return true;
 235
 236   /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
 237      SLP group are emitted at the position of the last scalar load and
 238      all loads in an interleaving group are emitted at the position
 239      of the first scalar load.
 240      Stores in a group are emitted at the position of the last scalar store.
 241      Compute that position and check whether the resulting order matches
 242      the current one.
 243      We have not yet decided between SLP and interleaving so we have
 244      to conservatively assume both.  */
 245   stmt_vec_info il_a;
 246   stmt_vec_info last_a = il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
 247   if (last_a)
 248     {
 249       for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
 250            s = DR_GROUP_NEXT_ELEMENT (s))
 251         last_a = get_later_stmt (last_a, s);
 252       if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
 253         {
 254           for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
 255                s = DR_GROUP_NEXT_ELEMENT (s))
 256             if (get_later_stmt (il_a, s) == il_a)
 257               il_a = s;
 258         }
 259       else
 260         il_a = last_a;
 261     }
 262   else
 263     last_a = il_a = stmtinfo_a;
 264   stmt_vec_info il_b;
 265   stmt_vec_info last_b = il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
 266   if (last_b)
 267     {
 268       for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
 269            s = DR_GROUP_NEXT_ELEMENT (s))
 270         last_b = get_later_stmt (last_b, s);
 271       if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
 272         {
 273           for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
 274                s = DR_GROUP_NEXT_ELEMENT (s))
 275             if (get_later_stmt (il_b, s) == il_b)
 276               il_b = s;
 277         }
 278       else
 279         il_b = last_b;
 280     }
 281   else
 282     last_b = il_b = stmtinfo_b;
 283   bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
 284   return (/* SLP */
 285           (get_later_stmt (last_a, last_b) == last_a) == a_after_b
 286           /* Interleaving */
 287           && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
 288           /* Mixed */
 289           && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
 290           && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
 291 }
 292
 293 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 294    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 295    distances.  These distances are conservatively correct but they don't
 296    reflect a guaranteed dependence.
 297
 298    Return true if this function does all the work necessary to avoid
 299    an alias or false if the caller should use the dependence distances
 300    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 301    the depth of the loop described by LOOP_VINFO and the other arguments
 302    are as for vect_analyze_data_ref_dependence.  */
 303
 304 static bool
 305 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 306                                        loop_vec_info loop_vinfo,
 307                                        int loop_depth, unsigned int *max_vf)
 308 {
 309   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 310   lambda_vector dist_v;
 311   unsigned int i;
 312   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 313     {
 314       int dist = dist_v[loop_depth];
 315       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 316         {
 317           /* If the user asserted safelen >= DIST consecutive iterations
 318              can be executed concurrently, assume independence.
 319
 320              ??? An alternative would be to add the alias check even
 321              in this case, and vectorize the fallback loop with the
 322              maximum VF set to safelen.  However, if the user has
 323              explicitly given a length, it's less likely that that
 324              would be a win.  */
 325           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 326             {
 327               if ((unsigned int) loop->safelen < *max_vf)
 328                 *max_vf = loop->safelen;
 329               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 330               continue;
 331             }
 332
 333           /* For dependence distances of 2 or more, we have the option
 334              of limiting VF or checking for an alias at runtime.
 335              Prefer to check at runtime if we can, to avoid limiting
 336              the VF unnecessarily when the bases are in fact independent.
 337
 338              Note that the alias checks will be removed if the VF ends up
 339              being small enough.  */
 340           dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
 341           dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
 342           return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
 343                   && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
 344                   && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
 345         }
 346     }
 347   return true;
 348 }
 349
 350
 351 /* Function vect_analyze_data_ref_dependence.
 352
 353    FIXME: I needed to change the sense of the returned flag.
 354
 355    Return FALSE if there (might) exist a dependence between a memory-reference
 356    DRA and a memory-reference DRB.  When versioning for alias may check a
 357    dependence at run-time, return TRUE.  Adjust *MAX_VF according to
 358    the data dependence.  */
 359
 360 static opt_result
 361 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 362                                   loop_vec_info loop_vinfo,
 363                                   unsigned int *max_vf)
 364 {
 365   unsigned int i;
 366   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 367   struct data_reference *dra = DDR_A (ddr);
 368   struct data_reference *drb = DDR_B (ddr);
 369   dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
 370   dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
 371   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
 372   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
 373   lambda_vector dist_v;
 374   unsigned int loop_depth;
 375
 376   /* In loop analysis all data references should be vectorizable.  */
 377   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 378       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 379     gcc_unreachable ();
 380
 381   /* Independent data accesses.  */
 382   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 383     return opt_result::success ();
 384
 385   if (dra == drb
 386       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 387     return opt_result::success ();
 388
 389   /* We do not have to consider dependences between accesses that belong
 390      to the same group, unless the stride could be smaller than the
 391      group size.  */
 392   if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 393       && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 394           == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
 395       && !STMT_VINFO_STRIDED_P (stmtinfo_a))
 396     return opt_result::success ();
 397
 398   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 399      least two scalar iterations, there is always also a true dependence.
 400      As the vectorizer does not re-order loads and stores we can ignore
 401      the anti-dependence if TBAA can disambiguate both DRs similar to the
 402      case with known negative distance anti-dependences (positive
 403      distance anti-dependences would violate TBAA constraints).  */
 404   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 405        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 406       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 407                                  get_alias_set (DR_REF (drb))))
 408     return opt_result::success ();
 409
 410   /* Unknown data dependence.  */
 411   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 412     {
 413       /* If user asserted safelen consecutive iterations can be
 414          executed concurrently, assume independence.  */
 415       if (loop->safelen >= 2)
 416         {
 417           if ((unsigned int) loop->safelen < *max_vf)
 418             *max_vf = loop->safelen;
 419           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 420           return opt_result::success ();
 421         }
 422
 423       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 424           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 425         return opt_result::failure_at
 426           (stmtinfo_a->stmt,
 427            "versioning for alias not supported for: "
 428            "can't determine dependence between %T and %T\n",
 429            DR_REF (dra), DR_REF (drb));
 430
 431       if (dump_enabled_p ())
 432         dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
 433                          "versioning for alias required: "
 434                          "can't determine dependence between %T and %T\n",
 435                          DR_REF (dra), DR_REF (drb));
 436
 437       /* Add to list of ddrs that need to be tested at run-time.  */
 438       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 439     }
 440
 441   /* Known data dependence.  */
 442   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 443     {
 444       /* If user asserted safelen consecutive iterations can be
 445          executed concurrently, assume independence.  */
 446       if (loop->safelen >= 2)
 447         {
 448           if ((unsigned int) loop->safelen < *max_vf)
 449             *max_vf = loop->safelen;
 450           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 451           return opt_result::success ();
 452         }
 453
 454       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 455           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 456         return opt_result::failure_at
 457           (stmtinfo_a->stmt,
 458            "versioning for alias not supported for: "
 459            "bad dist vector for %T and %T\n",
 460            DR_REF (dra), DR_REF (drb));
 461
 462       if (dump_enabled_p ())
 463         dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
 464                          "versioning for alias required: "
 465                          "bad dist vector for %T and %T\n",
 466                          DR_REF (dra), DR_REF (drb));
 467       /* Add to list of ddrs that need to be tested at run-time.  */
 468       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 469     }
 470
 471   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 472
 473   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
 474       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
 475                                                 loop_depth, max_vf))
 476     return opt_result::success ();
 477
 478   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 479     {
 480       int dist = dist_v[loop_depth];
 481
 482       if (dump_enabled_p ())
 483         dump_printf_loc (MSG_NOTE, vect_location,
 484                          "dependence distance  = %d.\n", dist);
 485
 486       if (dist == 0)
 487         {
 488           if (dump_enabled_p ())
 489             dump_printf_loc (MSG_NOTE, vect_location,
 490                              "dependence distance == 0 between %T and %T\n",
 491                              DR_REF (dra), DR_REF (drb));
 492
 493           /* When we perform grouped accesses and perform implicit CSE
 494              by detecting equal accesses and doing disambiguation with
 495              runtime alias tests like for
 496                 .. = a[i];
 497                 .. = a[i+1];
 498                 a[i] = ..;
 499                 a[i+1] = ..;
 500                 *p = ..;
 501                 .. = a[i];
 502                 .. = a[i+1];
 503              where we will end up loading { a[i], a[i+1] } once, make
 504              sure that inserting group loads before the first load and
 505              stores after the last store will do the right thing.
 506              Similar for groups like
 507                 a[i] = ...;
 508                 ... = a[i];
 509                 a[i+1] = ...;
 510              where loads from the group interleave with the store.  */
 511           if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
 512             return opt_result::failure_at (stmtinfo_a->stmt,
 513                                            "READ_WRITE dependence"
 514                                            " in interleaving.\n");
 515
 516           if (loop->safelen < 2)
 517             {
 518               tree indicator = dr_zero_step_indicator (dra);
 519               if (!indicator || integer_zerop (indicator))
 520                 return opt_result::failure_at (stmtinfo_a->stmt,
 521                                                "access also has a zero step\n");
 522               else if (TREE_CODE (indicator) != INTEGER_CST)
 523                 vect_check_nonzero_value (loop_vinfo, indicator);
 524             }
 525           continue;
 526         }
 527
 528       if (dist > 0 && DDR_REVERSED_P (ddr))
 529         {
 530           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 531              reversed (to make distance vector positive), and the actual
 532              distance is negative.  */
 533           if (dump_enabled_p ())
 534             dump_printf_loc (MSG_NOTE, vect_location,
 535                              "dependence distance negative.\n");
 536           /* When doing outer loop vectorization, we need to check if there is
 537              a backward dependence at the inner loop level if the dependence
 538              at the outer loop is reversed.  See PR81740.  */
 539           if (nested_in_vect_loop_p (loop, stmtinfo_a)
 540               || nested_in_vect_loop_p (loop, stmtinfo_b))
 541             {
 542               unsigned inner_depth = index_in_loop_nest (loop->inner->num,
 543                                                          DDR_LOOP_NEST (ddr));
 544               if (dist_v[inner_depth] < 0)
 545                 return opt_result::failure_at (stmtinfo_a->stmt,
 546                                                "not vectorized, dependence "
 547                                                "between data-refs %T and %T\n",
 548                                                DR_REF (dra), DR_REF (drb));
 549             }
 550           /* Record a negative dependence distance to later limit the
 551              amount of stmt copying / unrolling we can perform.
 552              Only need to handle read-after-write dependence.  */
 553           if (DR_IS_READ (drb)
 554               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 555                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 556             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 557           continue;
 558         }
 559
 560       unsigned int abs_dist = abs (dist);
 561       if (abs_dist >= 2 && abs_dist < *max_vf)
 562         {
 563           /* The dependence distance requires reduction of the maximal
 564              vectorization factor.  */
 565           *max_vf = abs_dist;
 566           if (dump_enabled_p ())
 567             dump_printf_loc (MSG_NOTE, vect_location,
 568                              "adjusting maximal vectorization factor to %i\n",
 569                              *max_vf);
 570         }
 571
 572       if (abs_dist >= *max_vf)
 573         {
 574           /* Dependence distance does not create dependence, as far as
 575              vectorization is concerned, in this case.  */
 576           if (dump_enabled_p ())
 577             dump_printf_loc (MSG_NOTE, vect_location,
 578                              "dependence distance >= VF.\n");
 579           continue;
 580         }
 581
 582       return opt_result::failure_at (stmtinfo_a->stmt,
 583                                      "not vectorized, possible dependence "
 584                                      "between data-refs %T and %T\n",
 585                                      DR_REF (dra), DR_REF (drb));
 586     }
 587
 588   return opt_result::success ();
 589 }
 590
 591 /* Function vect_analyze_data_ref_dependences.
 592
 593    Examine all the data references in the loop, and make sure there do not
 594    exist any data dependences between them.  Set *MAX_VF according to
 595    the maximum vectorization factor the data dependences allow.  */
 596
 597 opt_result
 598 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
 599                                    unsigned int *max_vf)
 600 {
 601   unsigned int i;
 602   struct data_dependence_relation *ddr;
 603
 604   DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
 605
 606   if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
 607     {
 608       LOOP_VINFO_DDRS (loop_vinfo)
 609         .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 610                  * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 611       /* We need read-read dependences to compute
 612          STMT_VINFO_SAME_ALIGN_REFS.  */
 613       bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 614                                           &LOOP_VINFO_DDRS (loop_vinfo),
 615                                           LOOP_VINFO_LOOP_NEST (loop_vinfo),
 616                                           true);
 617       gcc_assert (res);
 618     }
 619
 620   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 621
 622   /* For epilogues we either have no aliases or alias versioning
 623      was applied to original loop.  Therefore we may just get max_vf
 624      using VF of original loop.  */
 625   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 626     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 627   else
 628     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 629       {
 630         opt_result res
 631           = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
 632         if (!res)
 633           return res;
 634       }
 635
 636   return opt_result::success ();
 637 }
 638
 639
 640 /* Function vect_slp_analyze_data_ref_dependence.
 641
 642    Return TRUE if there (might) exist a dependence between a memory-reference
 643    DRA and a memory-reference DRB for VINFO.  When versioning for alias
 644    may check a dependence at run-time, return FALSE.  Adjust *MAX_VF
 645    according to the data dependence.  */
 646
 647 static bool
 648 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
 649                                       struct data_dependence_relation *ddr)
 650 {
 651   struct data_reference *dra = DDR_A (ddr);
 652   struct data_reference *drb = DDR_B (ddr);
 653   dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
 654   dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
 655
 656   /* We need to check dependences of statements marked as unvectorizable
 657      as well, they still can prohibit vectorization.  */
 658
 659   /* Independent data accesses.  */
 660   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 661     return false;
 662
 663   if (dra == drb)
 664     return false;
 665
 666   /* Read-read is OK.  */
 667   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 668     return false;
 669
 670   /* If dra and drb are part of the same interleaving chain consider
 671      them independent.  */
 672   if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
 673       && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
 674           == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
 675     return false;
 676
 677   /* Unknown data dependence.  */
 678   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 679     {
 680       if  (dump_enabled_p ())
 681         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 682                          "can't determine dependence between %T and %T\n",
 683                          DR_REF (dra), DR_REF (drb));
 684     }
 685   else if (dump_enabled_p ())
 686     dump_printf_loc (MSG_NOTE, vect_location,
 687                      "determined dependence between %T and %T\n",
 688                      DR_REF (dra), DR_REF (drb));
 689
 690   return true;
 691 }
 692
 693
 694 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 695    contain the vector of scalar stores of this instance if we are
 696    disambiguating the loads.  */
 697
 698 static bool
 699 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
 700                                    vec<stmt_vec_info> stores,
 701                                    stmt_vec_info last_store_info)
 702 {
 703   /* This walks over all stmts involved in the SLP load/store done
 704      in NODE verifying we can sink them up to the last stmt in the
 705      group.  */
 706   stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
 707   vec_info *vinfo = last_access_info->vinfo;
 708   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 709     {
 710       stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
 711       if (access_info == last_access_info)
 712         continue;
 713       data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
 714       ao_ref ref;
 715       bool ref_initialized_p = false;
 716       for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
 717            gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
 718         {
 719           gimple *stmt = gsi_stmt (gsi);
 720           if (! gimple_vuse (stmt)
 721               || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
 722             continue;
 723
 724           /* If we couldn't record a (single) data reference for this
 725              stmt we have to resort to the alias oracle.  */
 726           stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
 727           data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
 728           if (!dr_b)
 729             {
 730               /* We are moving a store or sinking a load - this means
 731                  we cannot use TBAA for disambiguation.  */
 732               if (!ref_initialized_p)
 733                 ao_ref_init (&ref, DR_REF (dr_a));
 734               if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
 735                   || ref_maybe_used_by_stmt_p (stmt, &ref, false))
 736                 return false;
 737               continue;
 738             }
 739
 740           bool dependent = false;
 741           /* If we run into a store of this same instance (we've just
 742              marked those) then delay dependence checking until we run
 743              into the last store because this is where it will have
 744              been sunk to (and we verify if we can do that as well).  */
 745           if (gimple_visited_p (stmt))
 746             {
 747               if (stmt_info != last_store_info)
 748                 continue;
 749               unsigned i;
 750               stmt_vec_info store_info;
 751               FOR_EACH_VEC_ELT (stores, i, store_info)
 752                 {
 753                   data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
 754                   ddr_p ddr = initialize_data_dependence_relation
 755                                 (dr_a, store_dr, vNULL);
 756                   dependent
 757                     = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 758                   free_dependence_relation (ddr);
 759                   if (dependent)
 760                     break;
 761                 }
 762             }
 763           else
 764             {
 765               ddr_p ddr = initialize_data_dependence_relation (dr_a,
 766                                                                dr_b, vNULL);
 767               dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 768               free_dependence_relation (ddr);
 769             }
 770           if (dependent)
 771             return false;
 772         }
 773     }
 774   return true;
 775 }
 776
 777
 778 /* Function vect_analyze_data_ref_dependences.
 779
 780    Examine all the data references in the basic-block, and make sure there
 781    do not exist any data dependences between them.  Set *MAX_VF according to
 782    the maximum vectorization factor the data dependences allow.  */
 783
 784 bool
 785 vect_slp_analyze_instance_dependence (slp_instance instance)
 786 {
 787   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
 788
 789   /* The stores of this instance are at the root of the SLP tree.  */
 790   slp_tree store = SLP_INSTANCE_TREE (instance);
 791   if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
 792     store = NULL;
 793
 794   /* Verify we can sink stores to the vectorized stmt insert location.  */
 795   stmt_vec_info last_store_info = NULL;
 796   if (store)
 797     {
 798       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
 799         return false;
 800
 801       /* Mark stores in this instance and remember the last one.  */
 802       last_store_info = vect_find_last_scalar_stmt_in_slp (store);
 803       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 804         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
 805     }
 806
 807   bool res = true;
 808
 809   /* Verify we can sink loads to the vectorized stmt insert location,
 810      special-casing stores of this instance.  */
 811   slp_tree load;
 812   unsigned int i;
 813   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
 814     if (! vect_slp_analyze_node_dependences (instance, load,
 815                                              store
 816                                              ? SLP_TREE_SCALAR_STMTS (store)
 817                                              : vNULL, last_store_info))
 818       {
 819         res = false;
 820         break;
 821       }
 822
 823   /* Unset the visited flag.  */
 824   if (store)
 825     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 826       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
 827
 828   return res;
 829 }
 830
 831 /* Record the base alignment guarantee given by DRB, which occurs
 832    in STMT_INFO.  */
 833
 834 static void
 835 vect_record_base_alignment (stmt_vec_info stmt_info,
 836                             innermost_loop_behavior *drb)
 837 {
 838   vec_info *vinfo = stmt_info->vinfo;
 839   bool existed;
 840   innermost_loop_behavior *&entry
 841     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
 842   if (!existed || entry->base_alignment < drb->base_alignment)
 843     {
 844       entry = drb;
 845       if (dump_enabled_p ())
 846         dump_printf_loc (MSG_NOTE, vect_location,
 847                          "recording new base alignment for %T\n"
 848                          "  alignment:    %d\n"
 849                          "  misalignment: %d\n"
 850                          "  based on:     %G",
 851                          drb->base_address,
 852                          drb->base_alignment,
 853                          drb->base_misalignment,
 854                          stmt_info->stmt);
 855     }
 856 }
 857
 858 /* If the region we're going to vectorize is reached, all unconditional
 859    data references occur at least once.  We can therefore pool the base
 860    alignment guarantees from each unconditional reference.  Do this by
 861    going through all the data references in VINFO and checking whether
 862    the containing statement makes the reference unconditionally.  If so,
 863    record the alignment of the base address in VINFO so that it can be
 864    used for all other references with the same base.  */
 865
 866 void
 867 vect_record_base_alignments (vec_info *vinfo)
 868 {
 869   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
 870   struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
 871   data_reference *dr;
 872   unsigned int i;
 873   FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
 874     {
 875       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
 876       stmt_vec_info stmt_info = dr_info->stmt;
 877       if (!DR_IS_CONDITIONAL_IN_STMT (dr)
 878           && STMT_VINFO_VECTORIZABLE (stmt_info)
 879           && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
 880         {
 881           vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
 882
 883           /* If DR is nested in the loop that is being vectorized, we can also
 884              record the alignment of the base wrt the outer loop.  */
 885           if (loop && nested_in_vect_loop_p (loop, stmt_info))
 886             vect_record_base_alignment
 887               (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
 888         }
 889     }
 890 }
 891
 892 /* Return the target alignment for the vectorized form of DR_INFO.  */
 893
 894 static poly_uint64
 895 vect_calculate_target_alignment (dr_vec_info *dr_info)
 896 {
 897   tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
 898   return targetm.vectorize.preferred_vector_alignment (vectype);
 899 }
 900
 901 /* Function vect_compute_data_ref_alignment
 902
 903    Compute the misalignment of the data reference DR_INFO.
 904
 905    Output:
 906    1. DR_MISALIGNMENT (DR_INFO) is defined.
 907
 908    FOR NOW: No analysis is actually performed. Misalignment is calculated
 909    only for trivial cases. TODO.  */
 910
 911 static void
 912 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
 913 {
 914   stmt_vec_info stmt_info = dr_info->stmt;
 915   vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
 916   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 917   struct loop *loop = NULL;
 918   tree ref = DR_REF (dr_info->dr);
 919   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 920
 921   if (dump_enabled_p ())
 922     dump_printf_loc (MSG_NOTE, vect_location,
 923                      "vect_compute_data_ref_alignment:\n");
 924
 925   if (loop_vinfo)
 926     loop = LOOP_VINFO_LOOP (loop_vinfo);
 927
 928   /* Initialize misalignment to unknown.  */
 929   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
 930
 931   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
 932     return;
 933
 934   innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
 935   bool step_preserves_misalignment_p;
 936
 937   poly_uint64 vector_alignment
 938     = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
 939   DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
 940
 941   unsigned HOST_WIDE_INT vect_align_c;
 942   if (!vector_alignment.is_constant (&vect_align_c))
 943     return;
 944
 945   /* No step for BB vectorization.  */
 946   if (!loop)
 947     {
 948       gcc_assert (integer_zerop (drb->step));
 949       step_preserves_misalignment_p = true;
 950     }
 951
 952   /* In case the dataref is in an inner-loop of the loop that is being
 953      vectorized (LOOP), we use the base and misalignment information
 954      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 955      stays the same throughout the execution of the inner-loop, which is why
 956      we have to check that the stride of the dataref in the inner-loop evenly
 957      divides by the vector alignment.  */
 958   else if (nested_in_vect_loop_p (loop, stmt_info))
 959     {
 960       step_preserves_misalignment_p
 961         = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
 962
 963       if (dump_enabled_p ())
 964         {
 965           if (step_preserves_misalignment_p)
 966             dump_printf_loc (MSG_NOTE, vect_location,
 967                              "inner step divides the vector alignment.\n");
 968           else
 969             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 970                              "inner step doesn't divide the vector"
 971                              " alignment.\n");
 972         }
 973     }
 974
 975   /* Similarly we can only use base and misalignment information relative to
 976      an innermost loop if the misalignment stays the same throughout the
 977      execution of the loop.  As above, this is the case if the stride of
 978      the dataref evenly divides by the alignment.  */
 979   else
 980     {
 981       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 982       step_preserves_misalignment_p
 983         = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
 984
 985       if (!step_preserves_misalignment_p && dump_enabled_p ())
 986         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 987                          "step doesn't divide the vector alignment.\n");
 988     }
 989
 990   unsigned int base_alignment = drb->base_alignment;
 991   unsigned int base_misalignment = drb->base_misalignment;
 992
 993   /* Calculate the maximum of the pooled base address alignment and the
 994      alignment that we can compute for DR itself.  */
 995   innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
 996   if (entry && base_alignment < (*entry)->base_alignment)
 997     {
 998       base_alignment = (*entry)->base_alignment;
 999       base_misalignment = (*entry)->base_misalignment;
1000     }
1001
1002   if (drb->offset_alignment < vect_align_c
1003       || !step_preserves_misalignment_p
1004       /* We need to know whether the step wrt the vectorized loop is
1005          negative when computing the starting misalignment below.  */
1006       || TREE_CODE (drb->step) != INTEGER_CST)
1007     {
1008       if (dump_enabled_p ())
1009         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1010                          "Unknown alignment for access: %T\n", ref);
1011       return;
1012     }
1013
1014   if (base_alignment < vect_align_c)
1015     {
1016       unsigned int max_alignment;
1017       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1018       if (max_alignment < vect_align_c
1019           || !vect_can_force_dr_alignment_p (base,
1020                                              vect_align_c * BITS_PER_UNIT))
1021         {
1022           if (dump_enabled_p ())
1023             dump_printf_loc (MSG_NOTE, vect_location,
1024                              "can't force alignment of ref: %T\n", ref);
1025           return;
1026         }
1027
1028       /* Force the alignment of the decl.
1029          NOTE: This is the only change to the code we make during
1030          the analysis phase, before deciding to vectorize the loop.  */
1031       if (dump_enabled_p ())
1032         dump_printf_loc (MSG_NOTE, vect_location,
1033                          "force alignment of %T\n", ref);
1034
1035       dr_info->base_decl = base;
1036       dr_info->base_misaligned = true;
1037       base_misalignment = 0;
1038     }
1039   poly_int64 misalignment
1040     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1041
1042   /* If this is a backward running DR then first access in the larger
1043      vectype actually is N-1 elements before the address in the DR.
1044      Adjust misalign accordingly.  */
1045   if (tree_int_cst_sgn (drb->step) < 0)
1046     /* PLUS because STEP is negative.  */
1047     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1048                      * TREE_INT_CST_LOW (drb->step));
1049
1050   unsigned int const_misalignment;
1051   if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1052     {
1053       if (dump_enabled_p ())
1054         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1055                          "Non-constant misalignment for access: %T\n", ref);
1056       return;
1057     }
1058
1059   SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1060
1061   if (dump_enabled_p ())
1062     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1063                      "misalign = %d bytes of ref %T\n",
1064                      DR_MISALIGNMENT (dr_info), ref);
1065
1066   return;
1067 }
1068
1069 /* Function vect_update_misalignment_for_peel.
1070    Sets DR_INFO's misalignment
1071    - to 0 if it has the same alignment as DR_PEEL_INFO,
1072    - to the misalignment computed using NPEEL if DR_INFO's salignment is known,
1073    - to -1 (unknown) otherwise.
1074
1075    DR_INFO - the data reference whose misalignment is to be adjusted.
1076    DR_PEEL_INFO - the data reference whose misalignment is being made
1077                   zero in the vector loop by the peel.
1078    NPEEL - the number of iterations in the peel loop if the misalignment
1079            of DR_PEEL_INFO is known at compile time.  */
1080
1081 static void
1082 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1083                                    dr_vec_info *dr_peel_info, int npeel)
1084 {
1085   unsigned int i;
1086   vec<dr_p> same_aligned_drs;
1087   struct data_reference *current_dr;
1088   stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1089
1090   /* It can be assumed that if dr_info has the same alignment as dr_peel,
1091      it is aligned in the vector loop.  */
1092   same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1093   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1094     {
1095       if (current_dr != dr_info->dr)
1096         continue;
1097       gcc_assert (!known_alignment_for_access_p (dr_info)
1098                   || !known_alignment_for_access_p (dr_peel_info)
1099                   || (DR_MISALIGNMENT (dr_info)
1100                       == DR_MISALIGNMENT (dr_peel_info)));
1101       SET_DR_MISALIGNMENT (dr_info, 0);
1102       return;
1103     }
1104
1105   unsigned HOST_WIDE_INT alignment;
1106   if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1107       && known_alignment_for_access_p (dr_info)
1108       && known_alignment_for_access_p (dr_peel_info))
1109     {
1110       int misal = DR_MISALIGNMENT (dr_info);
1111       misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1112       misal &= alignment - 1;
1113       SET_DR_MISALIGNMENT (dr_info, misal);
1114       return;
1115     }
1116
1117   if (dump_enabled_p ())
1118     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1119                      "to unknown (-1).\n");
1120   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1121 }
1122
1123
1124 /* Function verify_data_ref_alignment
1125
1126    Return TRUE if DR_INFO can be handled with respect to alignment.  */
1127
1128 static opt_result
1129 verify_data_ref_alignment (dr_vec_info *dr_info)
1130 {
1131   enum dr_alignment_support supportable_dr_alignment
1132     = vect_supportable_dr_alignment (dr_info, false);
1133   if (!supportable_dr_alignment)
1134     return opt_result::failure_at
1135       (dr_info->stmt->stmt,
1136        DR_IS_READ (dr_info->dr)
1137         ? "not vectorized: unsupported unaligned load: %T\n"
1138         : "not vectorized: unsupported unaligned store: %T\n",
1139        DR_REF (dr_info->dr));
1140
1141   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1142     dump_printf_loc (MSG_NOTE, vect_location,
1143                      "Vectorizing an unaligned access.\n");
1144
1145   return opt_result::success ();
1146 }
1147
1148 /* Function vect_verify_datarefs_alignment
1149
1150    Return TRUE if all data references in the loop can be
1151    handled with respect to alignment.  */
1152
1153 opt_result
1154 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1155 {
1156   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1157   struct data_reference *dr;
1158   unsigned int i;
1159
1160   FOR_EACH_VEC_ELT (datarefs, i, dr)
1161     {
1162       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1163       stmt_vec_info stmt_info = dr_info->stmt;
1164
1165       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1166         continue;
1167
1168       /* For interleaving, only the alignment of the first access matters.   */
1169       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1170           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1171         continue;
1172
1173       /* Strided accesses perform only component accesses, alignment is
1174          irrelevant for them.  */
1175       if (STMT_VINFO_STRIDED_P (stmt_info)
1176           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1177         continue;
1178
1179       opt_result res = verify_data_ref_alignment (dr_info);
1180       if (!res)
1181         return res;
1182     }
1183
1184   return opt_result::success ();
1185 }
1186
1187 /* Given an memory reference EXP return whether its alignment is less
1188    than its size.  */
1189
1190 static bool
1191 not_size_aligned (tree exp)
1192 {
1193   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1194     return true;
1195
1196   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1197           > get_object_alignment (exp));
1198 }
1199
1200 /* Function vector_alignment_reachable_p
1201
1202    Return true if vector alignment for DR_INFO is reachable by peeling
1203    a few loop iterations.  Return false otherwise.  */
1204
1205 static bool
1206 vector_alignment_reachable_p (dr_vec_info *dr_info)
1207 {
1208   stmt_vec_info stmt_info = dr_info->stmt;
1209   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1210
1211   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1212     {
1213       /* For interleaved access we peel only if number of iterations in
1214          the prolog loop ({VF - misalignment}), is a multiple of the
1215          number of the interleaved accesses.  */
1216       int elem_size, mis_in_elements;
1217
1218       /* FORNOW: handle only known alignment.  */
1219       if (!known_alignment_for_access_p (dr_info))
1220         return false;
1221
1222       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1223       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1224       elem_size = vector_element_size (vector_size, nelements);
1225       mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1226
1227       if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1228         return false;
1229     }
1230
1231   /* If misalignment is known at the compile time then allow peeling
1232      only if natural alignment is reachable through peeling.  */
1233   if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1234     {
1235       HOST_WIDE_INT elmsize =
1236                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1237       if (dump_enabled_p ())
1238         {
1239           dump_printf_loc (MSG_NOTE, vect_location,
1240                            "data size = %wd. misalignment = %d.\n", elmsize,
1241                            DR_MISALIGNMENT (dr_info));
1242         }
1243       if (DR_MISALIGNMENT (dr_info) % elmsize)
1244         {
1245           if (dump_enabled_p ())
1246             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1247                              "data size does not divide the misalignment.\n");
1248           return false;
1249         }
1250     }
1251
1252   if (!known_alignment_for_access_p (dr_info))
1253     {
1254       tree type = TREE_TYPE (DR_REF (dr_info->dr));
1255       bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1256       if (dump_enabled_p ())
1257         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1258                          "Unknown misalignment, %snaturally aligned\n",
1259                          is_packed ? "not " : "");
1260       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1261     }
1262
1263   return true;
1264 }
1265
1266
1267 /* Calculate the cost of the memory access represented by DR_INFO.  */
1268
1269 static void
1270 vect_get_data_access_cost (dr_vec_info *dr_info,
1271                            unsigned int *inside_cost,
1272                            unsigned int *outside_cost,
1273                            stmt_vector_for_cost *body_cost_vec,
1274                            stmt_vector_for_cost *prologue_cost_vec)
1275 {
1276   stmt_vec_info stmt_info = dr_info->stmt;
1277   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1278   int ncopies;
1279
1280   if (PURE_SLP_STMT (stmt_info))
1281     ncopies = 1;
1282   else
1283     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1284
1285   if (DR_IS_READ (dr_info->dr))
1286     vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1287                         prologue_cost_vec, body_cost_vec, false);
1288   else
1289     vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1290
1291   if (dump_enabled_p ())
1292     dump_printf_loc (MSG_NOTE, vect_location,
1293                      "vect_get_data_access_cost: inside_cost = %d, "
1294                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1295 }
1296
1297
1298 typedef struct _vect_peel_info
1299 {
1300   dr_vec_info *dr_info;
1301   int npeel;
1302   unsigned int count;
1303 } *vect_peel_info;
1304
1305 typedef struct _vect_peel_extended_info
1306 {
1307   struct _vect_peel_info peel_info;
1308   unsigned int inside_cost;
1309   unsigned int outside_cost;
1310 } *vect_peel_extended_info;
1311
1312
1313 /* Peeling hashtable helpers.  */
1314
1315 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1316 {
1317   static inline hashval_t hash (const _vect_peel_info *);
1318   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1319 };
1320
1321 inline hashval_t
1322 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1323 {
1324   return (hashval_t) peel_info->npeel;
1325 }
1326
1327 inline bool
1328 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1329 {
1330   return (a->npeel == b->npeel);
1331 }
1332
1333
1334 /* Insert DR_INFO into peeling hash table with NPEEL as key.  */
1335
1336 static void
1337 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1338                           loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1339                           int npeel)
1340 {
1341   struct _vect_peel_info elem, *slot;
1342   _vect_peel_info **new_slot;
1343   bool supportable_dr_alignment
1344     = vect_supportable_dr_alignment (dr_info, true);
1345
1346   elem.npeel = npeel;
1347   slot = peeling_htab->find (&elem);
1348   if (slot)
1349     slot->count++;
1350   else
1351     {
1352       slot = XNEW (struct _vect_peel_info);
1353       slot->npeel = npeel;
1354       slot->dr_info = dr_info;
1355       slot->count = 1;
1356       new_slot = peeling_htab->find_slot (slot, INSERT);
1357       *new_slot = slot;
1358     }
1359
1360   if (!supportable_dr_alignment
1361       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1362     slot->count += VECT_MAX_COST;
1363 }
1364
1365
1366 /* Traverse peeling hash table to find peeling option that aligns maximum
1367    number of data accesses.  */
1368
1369 int
1370 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1371                                      _vect_peel_extended_info *max)
1372 {
1373   vect_peel_info elem = *slot;
1374
1375   if (elem->count > max->peel_info.count
1376       || (elem->count == max->peel_info.count
1377           && max->peel_info.npeel > elem->npeel))
1378     {
1379       max->peel_info.npeel = elem->npeel;
1380       max->peel_info.count = elem->count;
1381       max->peel_info.dr_info = elem->dr_info;
1382     }
1383
1384   return 1;
1385 }
1386
1387 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1388    data access costs for all data refs.  If UNKNOWN_MISALIGNMENT is true,
1389    we assume DR0_INFO's misalignment will be zero after peeling.  */
1390
1391 static void
1392 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1393                                 dr_vec_info *dr0_info,
1394                                 unsigned int *inside_cost,
1395                                 unsigned int *outside_cost,
1396                                 stmt_vector_for_cost *body_cost_vec,
1397                                 stmt_vector_for_cost *prologue_cost_vec,
1398                                 unsigned int npeel,
1399                                 bool unknown_misalignment)
1400 {
1401   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1402   unsigned i;
1403   data_reference *dr;
1404
1405   FOR_EACH_VEC_ELT (datarefs, i, dr)
1406     {
1407       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1408       stmt_vec_info stmt_info = dr_info->stmt;
1409       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1410         continue;
1411
1412       /* For interleaving, only the alignment of the first access
1413          matters.  */
1414       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1415           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1416         continue;
1417
1418       /* Strided accesses perform only component accesses, alignment is
1419          irrelevant for them.  */
1420       if (STMT_VINFO_STRIDED_P (stmt_info)
1421           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1422         continue;
1423
1424       int save_misalignment;
1425       save_misalignment = DR_MISALIGNMENT (dr_info);
1426       if (npeel == 0)
1427         ;
1428       else if (unknown_misalignment && dr_info == dr0_info)
1429         SET_DR_MISALIGNMENT (dr_info, 0);
1430       else
1431         vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1432       vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1433                                  body_cost_vec, prologue_cost_vec);
1434       SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1435     }
1436 }
1437
1438 /* Traverse peeling hash table and calculate cost for each peeling option.
1439    Find the one with the lowest cost.  */
1440
1441 int
1442 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1443                                    _vect_peel_extended_info *min)
1444 {
1445   vect_peel_info elem = *slot;
1446   int dummy;
1447   unsigned int inside_cost = 0, outside_cost = 0;
1448   stmt_vec_info stmt_info = elem->dr_info->stmt;
1449   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1450   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1451                        epilogue_cost_vec;
1452
1453   prologue_cost_vec.create (2);
1454   body_cost_vec.create (2);
1455   epilogue_cost_vec.create (2);
1456
1457   vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1458                                   &outside_cost, &body_cost_vec,
1459                                   &prologue_cost_vec, elem->npeel, false);
1460
1461   body_cost_vec.release ();
1462
1463   outside_cost += vect_get_known_peeling_cost
1464     (loop_vinfo, elem->npeel, &dummy,
1465      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1466      &prologue_cost_vec, &epilogue_cost_vec);
1467
1468   /* Prologue and epilogue costs are added to the target model later.
1469      These costs depend only on the scalar iteration cost, the
1470      number of peeling iterations finally chosen, and the number of
1471      misaligned statements.  So discard the information found here.  */
1472   prologue_cost_vec.release ();
1473   epilogue_cost_vec.release ();
1474
1475   if (inside_cost < min->inside_cost
1476       || (inside_cost == min->inside_cost
1477           && outside_cost < min->outside_cost))
1478     {
1479       min->inside_cost = inside_cost;
1480       min->outside_cost = outside_cost;
1481       min->peel_info.dr_info = elem->dr_info;
1482       min->peel_info.npeel = elem->npeel;
1483       min->peel_info.count = elem->count;
1484     }
1485
1486   return 1;
1487 }
1488
1489
1490 /* Choose best peeling option by traversing peeling hash table and either
1491    choosing an option with the lowest cost (if cost model is enabled) or the
1492    option that aligns as many accesses as possible.  */
1493
1494 static struct _vect_peel_extended_info
1495 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1496                                        loop_vec_info loop_vinfo)
1497 {
1498    struct _vect_peel_extended_info res;
1499
1500    res.peel_info.dr_info = NULL;
1501
1502    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1503      {
1504        res.inside_cost = INT_MAX;
1505        res.outside_cost = INT_MAX;
1506        peeling_htab->traverse <_vect_peel_extended_info *,
1507                                vect_peeling_hash_get_lowest_cost> (&res);
1508      }
1509    else
1510      {
1511        res.peel_info.count = 0;
1512        peeling_htab->traverse <_vect_peel_extended_info *,
1513                                vect_peeling_hash_get_most_frequent> (&res);
1514        res.inside_cost = 0;
1515        res.outside_cost = 0;
1516      }
1517
1518    return res;
1519 }
1520
1521 /* Return true if the new peeling NPEEL is supported.  */
1522
1523 static bool
1524 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1525                           unsigned npeel)
1526 {
1527   unsigned i;
1528   struct data_reference *dr = NULL;
1529   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1530   enum dr_alignment_support supportable_dr_alignment;
1531
1532   /* Ensure that all data refs can be vectorized after the peel.  */
1533   FOR_EACH_VEC_ELT (datarefs, i, dr)
1534     {
1535       int save_misalignment;
1536
1537       if (dr == dr0_info->dr)
1538         continue;
1539
1540       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1541       stmt_vec_info stmt_info = dr_info->stmt;
1542       /* For interleaving, only the alignment of the first access
1543          matters.  */
1544       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1545           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1546         continue;
1547
1548       /* Strided accesses perform only component accesses, alignment is
1549          irrelevant for them.  */
1550       if (STMT_VINFO_STRIDED_P (stmt_info)
1551           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1552         continue;
1553
1554       save_misalignment = DR_MISALIGNMENT (dr_info);
1555       vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1556       supportable_dr_alignment
1557         = vect_supportable_dr_alignment (dr_info, false);
1558       SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1559
1560       if (!supportable_dr_alignment)
1561         return false;
1562     }
1563
1564   return true;
1565 }
1566
1567 /* Function vect_enhance_data_refs_alignment
1568
1569    This pass will use loop versioning and loop peeling in order to enhance
1570    the alignment of data references in the loop.
1571
1572    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1573    original loop is to be vectorized.  Any other loops that are created by
1574    the transformations performed in this pass - are not supposed to be
1575    vectorized.  This restriction will be relaxed.
1576
1577    This pass will require a cost model to guide it whether to apply peeling
1578    or versioning or a combination of the two.  For example, the scheme that
1579    Intel uses when given a loop with several memory accesses, is as follows:
1580    choose one memory access ('p') whose alignment you want to force by doing
1581    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1582    other accesses are not necessarily aligned, or (2) use loop versioning to
1583    generate one loop in which all accesses are aligned, and another loop in
1584    which only 'p' is necessarily aligned.
1585
1586    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1587    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1588    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1589
1590    Devising a cost model is the most critical aspect of this work.  It will
1591    guide us on which access to peel for, whether to use loop versioning, how
1592    many versions to create, etc.  The cost model will probably consist of
1593    generic considerations as well as target specific considerations (on
1594    powerpc for example, misaligned stores are more painful than misaligned
1595    loads).
1596
1597    Here are the general steps involved in alignment enhancements:
1598
1599      -- original loop, before alignment analysis:
1600         for (i=0; i<N; i++){
1601           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1602           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1603         }
1604
1605      -- After vect_compute_data_refs_alignment:
1606         for (i=0; i<N; i++){
1607           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1608           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1609         }
1610
1611      -- Possibility 1: we do loop versioning:
1612      if (p is aligned) {
1613         for (i=0; i<N; i++){    # loop 1A
1614           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1615           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1616         }
1617      }
1618      else {
1619         for (i=0; i<N; i++){    # loop 1B
1620           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1621           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1622         }
1623      }
1624
1625      -- Possibility 2: we do loop peeling:
1626      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1627         x = q[i];
1628         p[i] = y;
1629      }
1630      for (i = 3; i < N; i++){   # loop 2A
1631         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1632         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1633      }
1634
1635      -- Possibility 3: combination of loop peeling and versioning:
1636      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1637         x = q[i];
1638         p[i] = y;
1639      }
1640      if (p is aligned) {
1641         for (i = 3; i<N; i++){  # loop 3A
1642           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1643           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1644         }
1645      }
1646      else {
1647         for (i = 3; i<N; i++){  # loop 3B
1648           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1649           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1650         }
1651      }
1652
1653      These loops are later passed to loop_transform to be vectorized.  The
1654      vectorizer will use the alignment information to guide the transformation
1655      (whether to generate regular loads/stores, or with special handling for
1656      misalignment).  */
1657
1658 opt_result
1659 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1660 {
1661   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1662   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1663   enum dr_alignment_support supportable_dr_alignment;
1664   dr_vec_info *first_store = NULL;
1665   dr_vec_info *dr0_info = NULL;
1666   struct data_reference *dr;
1667   unsigned int i, j;
1668   bool do_peeling = false;
1669   bool do_versioning = false;
1670   unsigned int npeel = 0;
1671   bool one_misalignment_known = false;
1672   bool one_misalignment_unknown = false;
1673   bool one_dr_unsupportable = false;
1674   dr_vec_info *unsupportable_dr_info = NULL;
1675   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1676   unsigned possible_npeel_number = 1;
1677   tree vectype;
1678   unsigned int mis, same_align_drs_max = 0;
1679   hash_table<peel_info_hasher> peeling_htab (1);
1680
1681   DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1682
1683   /* Reset data so we can safely be called multiple times.  */
1684   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1685   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1686
1687   /* While cost model enhancements are expected in the future, the high level
1688      view of the code at this time is as follows:
1689
1690      A) If there is a misaligned access then see if peeling to align
1691         this access can make all data references satisfy
1692         vect_supportable_dr_alignment.  If so, update data structures
1693         as needed and return true.
1694
1695      B) If peeling wasn't possible and there is a data reference with an
1696         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1697         then see if loop versioning checks can be used to make all data
1698         references satisfy vect_supportable_dr_alignment.  If so, update
1699         data structures as needed and return true.
1700
1701      C) If neither peeling nor versioning were successful then return false if
1702         any data reference does not satisfy vect_supportable_dr_alignment.
1703
1704      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1705
1706      Note, Possibility 3 above (which is peeling and versioning together) is not
1707      being done at this time.  */
1708
1709   /* (1) Peeling to force alignment.  */
1710
1711   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1712      Considerations:
1713      + How many accesses will become aligned due to the peeling
1714      - How many accesses will become unaligned due to the peeling,
1715        and the cost of misaligned accesses.
1716      - The cost of peeling (the extra runtime checks, the increase
1717        in code size).  */
1718
1719   FOR_EACH_VEC_ELT (datarefs, i, dr)
1720     {
1721       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1722       stmt_vec_info stmt_info = dr_info->stmt;
1723
1724       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1725         continue;
1726
1727       /* For interleaving, only the alignment of the first access
1728          matters.  */
1729       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1730           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1731         continue;
1732
1733       /* For scatter-gather or invariant accesses there is nothing
1734          to enhance.  */
1735       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1736           || integer_zerop (DR_STEP (dr)))
1737         continue;
1738
1739       /* Strided accesses perform only component accesses, alignment is
1740          irrelevant for them.  */
1741       if (STMT_VINFO_STRIDED_P (stmt_info)
1742           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1743         continue;
1744
1745       supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1746       do_peeling = vector_alignment_reachable_p (dr_info);
1747       if (do_peeling)
1748         {
1749           if (known_alignment_for_access_p (dr_info))
1750             {
1751               unsigned int npeel_tmp = 0;
1752               bool negative = tree_int_cst_compare (DR_STEP (dr),
1753                                                     size_zero_node) < 0;
1754
1755               vectype = STMT_VINFO_VECTYPE (stmt_info);
1756               /* If known_alignment_for_access_p then we have set
1757                  DR_MISALIGNMENT which is only done if we know it at compile
1758                  time, so it is safe to assume target alignment is constant.
1759                */
1760               unsigned int target_align =
1761                 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1762               unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1763               mis = (negative
1764                      ? DR_MISALIGNMENT (dr_info)
1765                      : -DR_MISALIGNMENT (dr_info));
1766               if (DR_MISALIGNMENT (dr_info) != 0)
1767                 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1768
1769               /* For multiple types, it is possible that the bigger type access
1770                  will have more than one peeling option.  E.g., a loop with two
1771                  types: one of size (vector size / 4), and the other one of
1772                  size (vector size / 8).  Vectorization factor will be 8.  If
1773                  both accesses are misaligned by 3, the first one needs one
1774                  scalar iteration to be aligned, and the second one needs 5.
1775                  But the first one will be aligned also by peeling 5 scalar
1776                  iterations, and in that case both accesses will be aligned.
1777                  Hence, except for the immediate peeling amount, we also want
1778                  to try to add full vector size, while we don't exceed
1779                  vectorization factor.
1780                  We do this automatically for cost model, since we calculate
1781                  cost for every peeling option.  */
1782               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1783                 {
1784                   poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1785                                           ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1786                   possible_npeel_number
1787                     = vect_get_num_vectors (nscalars, vectype);
1788
1789                   /* NPEEL_TMP is 0 when there is no misalignment, but also
1790                      allow peeling NELEMENTS.  */
1791                   if (DR_MISALIGNMENT (dr_info) == 0)
1792                     possible_npeel_number++;
1793                 }
1794
1795               /* Save info about DR in the hash table.  Also include peeling
1796                  amounts according to the explanation above.  */
1797               for (j = 0; j < possible_npeel_number; j++)
1798                 {
1799                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1800                                             dr_info, npeel_tmp);
1801                   npeel_tmp += target_align / dr_size;
1802                 }
1803
1804               one_misalignment_known = true;
1805             }
1806           else
1807             {
1808               /* If we don't know any misalignment values, we prefer
1809                  peeling for data-ref that has the maximum number of data-refs
1810                  with the same alignment, unless the target prefers to align
1811                  stores over loads.  */
1812               unsigned same_align_drs
1813                 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1814               if (!dr0_info
1815                   || same_align_drs_max < same_align_drs)
1816                 {
1817                   same_align_drs_max = same_align_drs;
1818                   dr0_info = dr_info;
1819                 }
1820               /* For data-refs with the same number of related
1821                  accesses prefer the one where the misalign
1822                  computation will be invariant in the outermost loop.  */
1823               else if (same_align_drs_max == same_align_drs)
1824                 {
1825                   struct loop *ivloop0, *ivloop;
1826                   ivloop0 = outermost_invariant_loop_for_expr
1827                     (loop, DR_BASE_ADDRESS (dr0_info->dr));
1828                   ivloop = outermost_invariant_loop_for_expr
1829                     (loop, DR_BASE_ADDRESS (dr));
1830                   if ((ivloop && !ivloop0)
1831                       || (ivloop && ivloop0
1832                           && flow_loop_nested_p (ivloop, ivloop0)))
1833                     dr0_info = dr_info;
1834                 }
1835
1836               one_misalignment_unknown = true;
1837
1838               /* Check for data refs with unsupportable alignment that
1839                  can be peeled.  */
1840               if (!supportable_dr_alignment)
1841               {
1842                 one_dr_unsupportable = true;
1843                 unsupportable_dr_info = dr_info;
1844               }
1845
1846               if (!first_store && DR_IS_WRITE (dr))
1847                 first_store = dr_info;
1848             }
1849         }
1850       else
1851         {
1852           if (!aligned_access_p (dr_info))
1853             {
1854               if (dump_enabled_p ())
1855                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1856                                  "vector alignment may not be reachable\n");
1857               break;
1858             }
1859         }
1860     }
1861
1862   /* Check if we can possibly peel the loop.  */
1863   if (!vect_can_advance_ivs_p (loop_vinfo)
1864       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1865       || loop->inner)
1866     do_peeling = false;
1867
1868   struct _vect_peel_extended_info peel_for_known_alignment;
1869   struct _vect_peel_extended_info peel_for_unknown_alignment;
1870   struct _vect_peel_extended_info best_peel;
1871
1872   peel_for_unknown_alignment.inside_cost = INT_MAX;
1873   peel_for_unknown_alignment.outside_cost = INT_MAX;
1874   peel_for_unknown_alignment.peel_info.count = 0;
1875
1876   if (do_peeling
1877       && one_misalignment_unknown)
1878     {
1879       /* Check if the target requires to prefer stores over loads, i.e., if
1880          misaligned stores are more expensive than misaligned loads (taking
1881          drs with same alignment into account).  */
1882       unsigned int load_inside_cost = 0;
1883       unsigned int load_outside_cost = 0;
1884       unsigned int store_inside_cost = 0;
1885       unsigned int store_outside_cost = 0;
1886       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1887
1888       stmt_vector_for_cost dummy;
1889       dummy.create (2);
1890       vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1891                                       &load_inside_cost,
1892                                       &load_outside_cost,
1893                                       &dummy, &dummy, estimated_npeels, true);
1894       dummy.release ();
1895
1896       if (first_store)
1897         {
1898           dummy.create (2);
1899           vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1900                                           &store_inside_cost,
1901                                           &store_outside_cost,
1902                                           &dummy, &dummy,
1903                                           estimated_npeels, true);
1904           dummy.release ();
1905         }
1906       else
1907         {
1908           store_inside_cost = INT_MAX;
1909           store_outside_cost = INT_MAX;
1910         }
1911
1912       if (load_inside_cost > store_inside_cost
1913           || (load_inside_cost == store_inside_cost
1914               && load_outside_cost > store_outside_cost))
1915         {
1916           dr0_info = first_store;
1917           peel_for_unknown_alignment.inside_cost = store_inside_cost;
1918           peel_for_unknown_alignment.outside_cost = store_outside_cost;
1919         }
1920       else
1921         {
1922           peel_for_unknown_alignment.inside_cost = load_inside_cost;
1923           peel_for_unknown_alignment.outside_cost = load_outside_cost;
1924         }
1925
1926       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1927       prologue_cost_vec.create (2);
1928       epilogue_cost_vec.create (2);
1929
1930       int dummy2;
1931       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1932         (loop_vinfo, estimated_npeels, &dummy2,
1933          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1934          &prologue_cost_vec, &epilogue_cost_vec);
1935
1936       prologue_cost_vec.release ();
1937       epilogue_cost_vec.release ();
1938
1939       peel_for_unknown_alignment.peel_info.count = 1
1940         + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1941     }
1942
1943   peel_for_unknown_alignment.peel_info.npeel = 0;
1944   peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1945
1946   best_peel = peel_for_unknown_alignment;
1947
1948   peel_for_known_alignment.inside_cost = INT_MAX;
1949   peel_for_known_alignment.outside_cost = INT_MAX;
1950   peel_for_known_alignment.peel_info.count = 0;
1951   peel_for_known_alignment.peel_info.dr_info = NULL;
1952
1953   if (do_peeling && one_misalignment_known)
1954     {
1955       /* Peeling is possible, but there is no data access that is not supported
1956          unless aligned.  So we try to choose the best possible peeling from
1957          the hash table.  */
1958       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1959         (&peeling_htab, loop_vinfo);
1960     }
1961
1962   /* Compare costs of peeling for known and unknown alignment. */
1963   if (peel_for_known_alignment.peel_info.dr_info != NULL
1964       && peel_for_unknown_alignment.inside_cost
1965       >= peel_for_known_alignment.inside_cost)
1966     {
1967       best_peel = peel_for_known_alignment;
1968
1969       /* If the best peeling for known alignment has NPEEL == 0, perform no
1970          peeling at all except if there is an unsupportable dr that we can
1971          align.  */
1972       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1973         do_peeling = false;
1974     }
1975
1976   /* If there is an unsupportable data ref, prefer this over all choices so far
1977      since we'd have to discard a chosen peeling except when it accidentally
1978      aligned the unsupportable data ref.  */
1979   if (one_dr_unsupportable)
1980     dr0_info = unsupportable_dr_info;
1981   else if (do_peeling)
1982     {
1983       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1984          TODO: Use nopeel_outside_cost or get rid of it?  */
1985       unsigned nopeel_inside_cost = 0;
1986       unsigned nopeel_outside_cost = 0;
1987
1988       stmt_vector_for_cost dummy;
1989       dummy.create (2);
1990       vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
1991                                       &nopeel_outside_cost, &dummy, &dummy,
1992                                       0, false);
1993       dummy.release ();
1994
1995       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
1996          costs will be recorded.  */
1997       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1998       prologue_cost_vec.create (2);
1999       epilogue_cost_vec.create (2);
2000
2001       int dummy2;
2002       nopeel_outside_cost += vect_get_known_peeling_cost
2003         (loop_vinfo, 0, &dummy2,
2004          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2005          &prologue_cost_vec, &epilogue_cost_vec);
2006
2007       prologue_cost_vec.release ();
2008       epilogue_cost_vec.release ();
2009
2010       npeel = best_peel.peel_info.npeel;
2011       dr0_info = best_peel.peel_info.dr_info;
2012
2013       /* If no peeling is not more expensive than the best peeling we
2014          have so far, don't perform any peeling.  */
2015       if (nopeel_inside_cost <= best_peel.inside_cost)
2016         do_peeling = false;
2017     }
2018
2019   if (do_peeling)
2020     {
2021       stmt_vec_info stmt_info = dr0_info->stmt;
2022       vectype = STMT_VINFO_VECTYPE (stmt_info);
2023
2024       if (known_alignment_for_access_p (dr0_info))
2025         {
2026           bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2027                                                 size_zero_node) < 0;
2028           if (!npeel)
2029             {
2030               /* Since it's known at compile time, compute the number of
2031                  iterations in the peeled loop (the peeling factor) for use in
2032                  updating DR_MISALIGNMENT values.  The peeling factor is the
2033                  vectorization factor minus the misalignment as an element
2034                  count.  */
2035               mis = (negative
2036                      ? DR_MISALIGNMENT (dr0_info)
2037                      : -DR_MISALIGNMENT (dr0_info));
2038               /* If known_alignment_for_access_p then we have set
2039                  DR_MISALIGNMENT which is only done if we know it at compile
2040                  time, so it is safe to assume target alignment is constant.
2041                */
2042               unsigned int target_align =
2043                 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2044               npeel = ((mis & (target_align - 1))
2045                        / vect_get_scalar_dr_size (dr0_info));
2046             }
2047
2048           /* For interleaved data access every iteration accesses all the
2049              members of the group, therefore we divide the number of iterations
2050              by the group size.  */
2051           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2052             npeel /= DR_GROUP_SIZE (stmt_info);
2053
2054           if (dump_enabled_p ())
2055             dump_printf_loc (MSG_NOTE, vect_location,
2056                              "Try peeling by %d\n", npeel);
2057         }
2058
2059       /* Ensure that all datarefs can be vectorized after the peel.  */
2060       if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2061         do_peeling = false;
2062
2063       /* Check if all datarefs are supportable and log.  */
2064       if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2065         {
2066           opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2067           if (!stat)
2068             do_peeling = false;
2069           else
2070             return stat;
2071         }
2072
2073       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2074       if (do_peeling)
2075         {
2076           unsigned max_allowed_peel
2077             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2078           if (max_allowed_peel != (unsigned)-1)
2079             {
2080               unsigned max_peel = npeel;
2081               if (max_peel == 0)
2082                 {
2083                   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2084                   unsigned HOST_WIDE_INT target_align_c;
2085                   if (target_align.is_constant (&target_align_c))
2086                     max_peel =
2087                       target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2088                   else
2089                     {
2090                       do_peeling = false;
2091                       if (dump_enabled_p ())
2092                         dump_printf_loc (MSG_NOTE, vect_location,
2093                           "Disable peeling, max peels set and vector"
2094                           " alignment unknown\n");
2095                     }
2096                 }
2097               if (max_peel > max_allowed_peel)
2098                 {
2099                   do_peeling = false;
2100                   if (dump_enabled_p ())
2101                     dump_printf_loc (MSG_NOTE, vect_location,
2102                         "Disable peeling, max peels reached: %d\n", max_peel);
2103                 }
2104             }
2105         }
2106
2107       /* Cost model #2 - if peeling may result in a remaining loop not
2108          iterating enough to be vectorized then do not peel.  Since this
2109          is a cost heuristic rather than a correctness decision, use the
2110          most likely runtime value for variable vectorization factors.  */
2111       if (do_peeling
2112           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2113         {
2114           unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2115           unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2116           if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2117               < assumed_vf + max_peel)
2118             do_peeling = false;
2119         }
2120
2121       if (do_peeling)
2122         {
2123           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2124              If the misalignment of DR_i is identical to that of dr0 then set
2125              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2126              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2127              by the peeling factor times the element size of DR_i (MOD the
2128              vectorization factor times the size).  Otherwise, the
2129              misalignment of DR_i must be set to unknown.  */
2130           FOR_EACH_VEC_ELT (datarefs, i, dr)
2131             if (dr != dr0_info->dr)
2132               {
2133                 /* Strided accesses perform only component accesses, alignment
2134                    is irrelevant for them.  */
2135                 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2136                 stmt_info = dr_info->stmt;
2137                 if (STMT_VINFO_STRIDED_P (stmt_info)
2138                     && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2139                   continue;
2140
2141                 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2142               }
2143
2144           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2145           if (npeel)
2146             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2147           else
2148             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2149               = DR_MISALIGNMENT (dr0_info);
2150           SET_DR_MISALIGNMENT (dr0_info, 0);
2151           if (dump_enabled_p ())
2152             {
2153               dump_printf_loc (MSG_NOTE, vect_location,
2154                                "Alignment of access forced using peeling.\n");
2155               dump_printf_loc (MSG_NOTE, vect_location,
2156                                "Peeling for alignment will be applied.\n");
2157             }
2158
2159           /* The inside-loop cost will be accounted for in vectorizable_load
2160              and vectorizable_store correctly with adjusted alignments.
2161              Drop the body_cst_vec on the floor here.  */
2162           opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2163           gcc_assert (stat);
2164           return stat;
2165         }
2166     }
2167
2168   /* (2) Versioning to force alignment.  */
2169
2170   /* Try versioning if:
2171      1) optimize loop for speed
2172      2) there is at least one unsupported misaligned data ref with an unknown
2173         misalignment, and
2174      3) all misaligned data refs with a known misalignment are supported, and
2175      4) the number of runtime alignment checks is within reason.  */
2176
2177   do_versioning =
2178         optimize_loop_nest_for_speed_p (loop)
2179         && (!loop->inner); /* FORNOW */
2180
2181   if (do_versioning)
2182     {
2183       FOR_EACH_VEC_ELT (datarefs, i, dr)
2184         {
2185           dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2186           stmt_vec_info stmt_info = dr_info->stmt;
2187
2188           /* For interleaving, only the alignment of the first access
2189              matters.  */
2190           if (aligned_access_p (dr_info)
2191               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2192                   && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2193             continue;
2194
2195           if (STMT_VINFO_STRIDED_P (stmt_info))
2196             {
2197               /* Strided loads perform only component accesses, alignment is
2198                  irrelevant for them.  */
2199               if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2200                 continue;
2201               do_versioning = false;
2202               break;
2203             }
2204
2205           supportable_dr_alignment
2206             = vect_supportable_dr_alignment (dr_info, false);
2207
2208           if (!supportable_dr_alignment)
2209             {
2210               int mask;
2211               tree vectype;
2212
2213               if (known_alignment_for_access_p (dr_info)
2214                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2215                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2216                 {
2217                   do_versioning = false;
2218                   break;
2219                 }
2220
2221               vectype = STMT_VINFO_VECTYPE (stmt_info);
2222               gcc_assert (vectype);
2223
2224               /* At present we don't support versioning for alignment
2225                  with variable VF, since there's no guarantee that the
2226                  VF is a power of two.  We could relax this if we added
2227                  a way of enforcing a power-of-two size.  */
2228               unsigned HOST_WIDE_INT size;
2229               if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2230                 {
2231                   do_versioning = false;
2232                   break;
2233                 }
2234
2235               /* Forcing alignment in the first iteration is no good if
2236                  we don't keep it across iterations.  For now, just disable
2237                  versioning in this case.
2238                  ?? We could actually unroll the loop to achieve the required
2239                  overall step alignment, and forcing the alignment could be
2240                  done by doing some iterations of the non-vectorized loop.  */
2241               if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2242                                * DR_STEP_ALIGNMENT (dr),
2243                                DR_TARGET_ALIGNMENT (dr_info)))
2244                 {
2245                   do_versioning = false;
2246                   break;
2247                 }
2248
2249               /* The rightmost bits of an aligned address must be zeros.
2250                  Construct the mask needed for this test.  For example,
2251                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2252                  mask must be 15 = 0xf. */
2253               mask = size - 1;
2254
2255               /* FORNOW: use the same mask to test all potentially unaligned
2256                  references in the loop.  The vectorizer currently supports
2257                  a single vector size, see the reference to
2258                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2259                  vectorization factor is computed.  */
2260               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2261                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2262               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2263               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2264             }
2265         }
2266
2267       /* Versioning requires at least one misaligned data reference.  */
2268       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2269         do_versioning = false;
2270       else if (!do_versioning)
2271         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2272     }
2273
2274   if (do_versioning)
2275     {
2276       vec<stmt_vec_info> may_misalign_stmts
2277         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2278       stmt_vec_info stmt_info;
2279
2280       /* It can now be assumed that the data references in the statements
2281          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2282          of the loop being vectorized.  */
2283       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2284         {
2285           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2286           SET_DR_MISALIGNMENT (dr_info, 0);
2287           if (dump_enabled_p ())
2288             dump_printf_loc (MSG_NOTE, vect_location,
2289                              "Alignment of access forced using versioning.\n");
2290         }
2291
2292       if (dump_enabled_p ())
2293         dump_printf_loc (MSG_NOTE, vect_location,
2294                          "Versioning for alignment will be applied.\n");
2295
2296       /* Peeling and versioning can't be done together at this time.  */
2297       gcc_assert (! (do_peeling && do_versioning));
2298
2299       opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2300       gcc_assert (stat);
2301       return stat;
2302     }
2303
2304   /* This point is reached if neither peeling nor versioning is being done.  */
2305   gcc_assert (! (do_peeling || do_versioning));
2306
2307   opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2308   return stat;
2309 }
2310
2311
2312 /* Function vect_find_same_alignment_drs.
2313
2314    Update group and alignment relations in VINFO according to the chosen
2315    vectorization factor.  */
2316
2317 static void
2318 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2319 {
2320   struct data_reference *dra = DDR_A (ddr);
2321   struct data_reference *drb = DDR_B (ddr);
2322   dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2323   dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2324   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2325   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2326
2327   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2328     return;
2329
2330   if (dra == drb)
2331     return;
2332
2333   if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2334       || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2335     return;
2336
2337   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2338       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2339       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2340     return;
2341
2342   /* Two references with distance zero have the same alignment.  */
2343   poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2344                           - wi::to_poly_offset (DR_INIT (drb)));
2345   if (maybe_ne (diff, 0))
2346     {
2347       /* Get the wider of the two alignments.  */
2348       poly_uint64 align_a =
2349         exact_div (vect_calculate_target_alignment (dr_info_a),
2350                    BITS_PER_UNIT);
2351       poly_uint64 align_b =
2352         exact_div (vect_calculate_target_alignment (dr_info_b),
2353                    BITS_PER_UNIT);
2354       unsigned HOST_WIDE_INT align_a_c, align_b_c;
2355       if (!align_a.is_constant (&align_a_c)
2356           || !align_b.is_constant (&align_b_c))
2357         return;
2358
2359       unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c);
2360
2361       /* Require the gap to be a multiple of the larger vector alignment.  */
2362       if (!multiple_p (diff, max_align))
2363         return;
2364     }
2365
2366   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2367   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2368   if (dump_enabled_p ())
2369     dump_printf_loc (MSG_NOTE, vect_location,
2370                      "accesses have the same alignment: %T and %T\n",
2371                      DR_REF (dra), DR_REF (drb));
2372 }
2373
2374
2375 /* Function vect_analyze_data_refs_alignment
2376
2377    Analyze the alignment of the data-references in the loop.
2378    Return FALSE if a data reference is found that cannot be vectorized.  */
2379
2380 opt_result
2381 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2382 {
2383   DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2384
2385   /* Mark groups of data references with same alignment using
2386      data dependence information.  */
2387   vec<ddr_p> ddrs = vinfo->shared->ddrs;
2388   struct data_dependence_relation *ddr;
2389   unsigned int i;
2390
2391   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2392     vect_find_same_alignment_drs (vinfo, ddr);
2393
2394   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2395   struct data_reference *dr;
2396
2397   vect_record_base_alignments (vinfo);
2398   FOR_EACH_VEC_ELT (datarefs, i, dr)
2399     {
2400       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2401       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2402         vect_compute_data_ref_alignment (dr_info);
2403     }
2404
2405   return opt_result::success ();
2406 }
2407
2408
2409 /* Analyze alignment of DRs of stmts in NODE.  */
2410
2411 static bool
2412 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2413 {
2414   /* We vectorize from the first scalar stmt in the node unless
2415      the node is permuted in which case we start from the first
2416      element in the group.  */
2417   stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2418   dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2419   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2420     first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2421
2422   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2423   vect_compute_data_ref_alignment (dr_info);
2424   /* For creating the data-ref pointer we need alignment of the
2425      first element anyway.  */
2426   if (dr_info != first_dr_info)
2427     vect_compute_data_ref_alignment (first_dr_info);
2428   if (! verify_data_ref_alignment (dr_info))
2429     {
2430       if (dump_enabled_p ())
2431         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2432                          "not vectorized: bad data alignment in basic "
2433                          "block.\n");
2434       return false;
2435     }
2436
2437   return true;
2438 }
2439
2440 /* Function vect_slp_analyze_instance_alignment
2441
2442    Analyze the alignment of the data-references in the SLP instance.
2443    Return FALSE if a data reference is found that cannot be vectorized.  */
2444
2445 bool
2446 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2447 {
2448   DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2449
2450   slp_tree node;
2451   unsigned i;
2452   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2453     if (! vect_slp_analyze_and_verify_node_alignment (node))
2454       return false;
2455
2456   node = SLP_INSTANCE_TREE (instance);
2457   if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2458       && ! vect_slp_analyze_and_verify_node_alignment
2459              (SLP_INSTANCE_TREE (instance)))
2460     return false;
2461
2462   return true;
2463 }
2464
2465
2466 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2467    accesses of legal size, step, etc.  Detect gaps, single element
2468    interleaving, and other special cases. Set grouped access info.
2469    Collect groups of strided stores for further use in SLP analysis.
2470    Worker for vect_analyze_group_access.  */
2471
2472 static bool
2473 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2474 {
2475   data_reference *dr = dr_info->dr;
2476   tree step = DR_STEP (dr);
2477   tree scalar_type = TREE_TYPE (DR_REF (dr));
2478   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2479   stmt_vec_info stmt_info = dr_info->stmt;
2480   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2481   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2482   HOST_WIDE_INT dr_step = -1;
2483   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2484   bool slp_impossible = false;
2485
2486   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2487      size of the interleaving group (including gaps).  */
2488   if (tree_fits_shwi_p (step))
2489     {
2490       dr_step = tree_to_shwi (step);
2491       /* Check that STEP is a multiple of type size.  Otherwise there is
2492          a non-element-sized gap at the end of the group which we
2493          cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2494          ???  As we can handle non-constant step fine here we should
2495          simply remove uses of DR_GROUP_GAP between the last and first
2496          element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
2497          simply not include that gap.  */
2498       if ((dr_step % type_size) != 0)
2499         {
2500           if (dump_enabled_p ())
2501             dump_printf_loc (MSG_NOTE, vect_location,
2502                              "Step %T is not a multiple of the element size"
2503                              " for %T\n",
2504                              step, DR_REF (dr));
2505           return false;
2506         }
2507       groupsize = absu_hwi (dr_step) / type_size;
2508     }
2509   else
2510     groupsize = 0;
2511
2512   /* Not consecutive access is possible only if it is a part of interleaving.  */
2513   if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2514     {
2515       /* Check if it this DR is a part of interleaving, and is a single
2516          element of the group that is accessed in the loop.  */
2517
2518       /* Gaps are supported only for loads. STEP must be a multiple of the type
2519          size.  */
2520       if (DR_IS_READ (dr)
2521           && (dr_step % type_size) == 0
2522           && groupsize > 0)
2523         {
2524           DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2525           DR_GROUP_SIZE (stmt_info) = groupsize;
2526           DR_GROUP_GAP (stmt_info) = groupsize - 1;
2527           if (dump_enabled_p ())
2528             dump_printf_loc (MSG_NOTE, vect_location,
2529                              "Detected single element interleaving %T"
2530                              " step %T\n",
2531                              DR_REF (dr), step);
2532
2533           return true;
2534         }
2535
2536       if (dump_enabled_p ())
2537         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2538                          "not consecutive access %G", stmt_info->stmt);
2539
2540       if (bb_vinfo)
2541         {
2542           /* Mark the statement as unvectorizable.  */
2543           STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2544           return true;
2545         }
2546
2547       if (dump_enabled_p ())
2548         dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2549       STMT_VINFO_STRIDED_P (stmt_info) = true;
2550       return true;
2551     }
2552
2553   if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2554     {
2555       /* First stmt in the interleaving chain. Check the chain.  */
2556       stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2557       struct data_reference *data_ref = dr;
2558       unsigned int count = 1;
2559       tree prev_init = DR_INIT (data_ref);
2560       HOST_WIDE_INT diff, gaps = 0;
2561
2562       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2563       while (next)
2564         {
2565           /* We never have the same DR multiple times.  */
2566           gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2567                                 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2568
2569           data_ref = STMT_VINFO_DATA_REF (next);
2570
2571           /* All group members have the same STEP by construction.  */
2572           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2573
2574           /* Check that the distance between two accesses is equal to the type
2575              size. Otherwise, we have gaps.  */
2576           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2577                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2578           if (diff != 1)
2579             {
2580               /* FORNOW: SLP of accesses with gaps is not supported.  */
2581               slp_impossible = true;
2582               if (DR_IS_WRITE (data_ref))
2583                 {
2584                   if (dump_enabled_p ())
2585                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2586                                      "interleaved store with gaps\n");
2587                   return false;
2588                 }
2589
2590               gaps += diff - 1;
2591             }
2592
2593           last_accessed_element += diff;
2594
2595           /* Store the gap from the previous member of the group. If there is no
2596              gap in the access, DR_GROUP_GAP is always 1.  */
2597           DR_GROUP_GAP (next) = diff;
2598
2599           prev_init = DR_INIT (data_ref);
2600           next = DR_GROUP_NEXT_ELEMENT (next);
2601           /* Count the number of data-refs in the chain.  */
2602           count++;
2603         }
2604
2605       if (groupsize == 0)
2606         groupsize = count + gaps;
2607
2608       /* This could be UINT_MAX but as we are generating code in a very
2609          inefficient way we have to cap earlier.  See PR78699 for example.  */
2610       if (groupsize > 4096)
2611         {
2612           if (dump_enabled_p ())
2613             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2614                              "group is too large\n");
2615           return false;
2616         }
2617
2618       /* Check that the size of the interleaving is equal to count for stores,
2619          i.e., that there are no gaps.  */
2620       if (groupsize != count
2621           && !DR_IS_READ (dr))
2622         {
2623           groupsize = count;
2624           STMT_VINFO_STRIDED_P (stmt_info) = true;
2625         }
2626
2627       /* If there is a gap after the last load in the group it is the
2628          difference between the groupsize and the last accessed
2629          element.
2630          When there is no gap, this difference should be 0.  */
2631       DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2632
2633       DR_GROUP_SIZE (stmt_info) = groupsize;
2634       if (dump_enabled_p ())
2635         {
2636           dump_printf_loc (MSG_NOTE, vect_location,
2637                            "Detected interleaving ");
2638           if (DR_IS_READ (dr))
2639             dump_printf (MSG_NOTE, "load ");
2640           else if (STMT_VINFO_STRIDED_P (stmt_info))
2641             dump_printf (MSG_NOTE, "strided store ");
2642           else
2643             dump_printf (MSG_NOTE, "store ");
2644           dump_printf (MSG_NOTE, "of size %u\n",
2645                        (unsigned)groupsize);
2646           dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2647           next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2648           while (next)
2649             {
2650               if (DR_GROUP_GAP (next) != 1)
2651                 dump_printf_loc (MSG_NOTE, vect_location,
2652                                  "\t<gap of %d elements>\n",
2653                                  DR_GROUP_GAP (next) - 1);
2654               dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2655               next = DR_GROUP_NEXT_ELEMENT (next);
2656             }
2657           if (DR_GROUP_GAP (stmt_info) != 0)
2658             dump_printf_loc (MSG_NOTE, vect_location,
2659                              "\t<gap of %d elements>\n",
2660                              DR_GROUP_GAP (stmt_info));
2661         }
2662
2663       /* SLP: create an SLP data structure for every interleaving group of
2664          stores for further analysis in vect_analyse_slp.  */
2665       if (DR_IS_WRITE (dr) && !slp_impossible)
2666         {
2667           if (loop_vinfo)
2668             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2669           if (bb_vinfo)
2670             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2671         }
2672     }
2673
2674   return true;
2675 }
2676
2677 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2678    accesses of legal size, step, etc.  Detect gaps, single element
2679    interleaving, and other special cases. Set grouped access info.
2680    Collect groups of strided stores for further use in SLP analysis.  */
2681
2682 static bool
2683 vect_analyze_group_access (dr_vec_info *dr_info)
2684 {
2685   if (!vect_analyze_group_access_1 (dr_info))
2686     {
2687       /* Dissolve the group if present.  */
2688       stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2689       while (stmt_info)
2690         {
2691           stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2692           DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2693           DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2694           stmt_info = next;
2695         }
2696       return false;
2697     }
2698   return true;
2699 }
2700
2701 /* Analyze the access pattern of the data-reference DR_INFO.
2702    In case of non-consecutive accesses call vect_analyze_group_access() to
2703    analyze groups of accesses.  */
2704
2705 static bool
2706 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2707 {
2708   data_reference *dr = dr_info->dr;
2709   tree step = DR_STEP (dr);
2710   tree scalar_type = TREE_TYPE (DR_REF (dr));
2711   stmt_vec_info stmt_info = dr_info->stmt;
2712   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2713   struct loop *loop = NULL;
2714
2715   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2716     return true;
2717
2718   if (loop_vinfo)
2719     loop = LOOP_VINFO_LOOP (loop_vinfo);
2720
2721   if (loop_vinfo && !step)
2722     {
2723       if (dump_enabled_p ())
2724         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2725                          "bad data-ref access in loop\n");
2726       return false;
2727     }
2728
2729   /* Allow loads with zero step in inner-loop vectorization.  */
2730   if (loop_vinfo && integer_zerop (step))
2731     {
2732       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2733       if (!nested_in_vect_loop_p (loop, stmt_info))
2734         return DR_IS_READ (dr);
2735       /* Allow references with zero step for outer loops marked
2736          with pragma omp simd only - it guarantees absence of
2737          loop-carried dependencies between inner loop iterations.  */
2738       if (loop->safelen < 2)
2739         {
2740           if (dump_enabled_p ())
2741             dump_printf_loc (MSG_NOTE, vect_location,
2742                              "zero step in inner loop of nest\n");
2743           return false;
2744         }
2745     }
2746
2747   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2748     {
2749       /* Interleaved accesses are not yet supported within outer-loop
2750         vectorization for references in the inner-loop.  */
2751       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2752
2753       /* For the rest of the analysis we use the outer-loop step.  */
2754       step = STMT_VINFO_DR_STEP (stmt_info);
2755       if (integer_zerop (step))
2756         {
2757           if (dump_enabled_p ())
2758             dump_printf_loc (MSG_NOTE, vect_location,
2759                              "zero step in outer loop.\n");
2760           return DR_IS_READ (dr);
2761         }
2762     }
2763
2764   /* Consecutive?  */
2765   if (TREE_CODE (step) == INTEGER_CST)
2766     {
2767       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2768       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2769           || (dr_step < 0
2770               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2771         {
2772           /* Mark that it is not interleaving.  */
2773           DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2774           return true;
2775         }
2776     }
2777
2778   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2779     {
2780       if (dump_enabled_p ())
2781         dump_printf_loc (MSG_NOTE, vect_location,
2782                          "grouped access in outer loop.\n");
2783       return false;
2784     }
2785
2786
2787   /* Assume this is a DR handled by non-constant strided load case.  */
2788   if (TREE_CODE (step) != INTEGER_CST)
2789     return (STMT_VINFO_STRIDED_P (stmt_info)
2790             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2791                 || vect_analyze_group_access (dr_info)));
2792
2793   /* Not consecutive access - check if it's a part of interleaving group.  */
2794   return vect_analyze_group_access (dr_info);
2795 }
2796
2797 /* Compare two data-references DRA and DRB to group them into chunks
2798    suitable for grouping.  */
2799
2800 static int
2801 dr_group_sort_cmp (const void *dra_, const void *drb_)
2802 {
2803   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2804   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2805   int cmp;
2806
2807   /* Stabilize sort.  */
2808   if (dra == drb)
2809     return 0;
2810
2811   /* DRs in different loops never belong to the same group.  */
2812   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2813   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2814   if (loopa != loopb)
2815     return loopa->num < loopb->num ? -1 : 1;
2816
2817   /* Ordering of DRs according to base.  */
2818   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2819                                DR_BASE_ADDRESS (drb));
2820   if (cmp != 0)
2821     return cmp;
2822
2823   /* And according to DR_OFFSET.  */
2824   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2825   if (cmp != 0)
2826     return cmp;
2827
2828   /* Put reads before writes.  */
2829   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2830     return DR_IS_READ (dra) ? -1 : 1;
2831
2832   /* Then sort after access size.  */
2833   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2834                                TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2835   if (cmp != 0)
2836     return cmp;
2837
2838   /* And after step.  */
2839   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2840   if (cmp != 0)
2841     return cmp;
2842
2843   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2844   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2845   if (cmp == 0)
2846     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2847   return cmp;
2848 }
2849
2850 /* If OP is the result of a conversion, return the unconverted value,
2851    otherwise return null.  */
2852
2853 static tree
2854 strip_conversion (tree op)
2855 {
2856   if (TREE_CODE (op) != SSA_NAME)
2857     return NULL_TREE;
2858   gimple *stmt = SSA_NAME_DEF_STMT (op);
2859   if (!is_gimple_assign (stmt)
2860       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2861     return NULL_TREE;
2862   return gimple_assign_rhs1 (stmt);
2863 }
2864
2865 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2866    and STMT2_INFO being in a single group.  When ALLOW_SLP_P, masked loads can
2867    be grouped in SLP mode.  */
2868
2869 static bool
2870 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
2871                    bool allow_slp_p)
2872 {
2873   if (gimple_assign_single_p (stmt1_info->stmt))
2874     return gimple_assign_single_p (stmt2_info->stmt);
2875
2876   gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2877   if (call1 && gimple_call_internal_p (call1))
2878     {
2879       /* Check for two masked loads or two masked stores.  */
2880       gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2881       if (!call2 || !gimple_call_internal_p (call2))
2882         return false;
2883       internal_fn ifn = gimple_call_internal_fn (call1);
2884       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2885         return false;
2886       if (ifn != gimple_call_internal_fn (call2))
2887         return false;
2888
2889       /* Check that the masks are the same.  Cope with casts of masks,
2890          like those created by build_mask_conversion.  */
2891       tree mask1 = gimple_call_arg (call1, 2);
2892       tree mask2 = gimple_call_arg (call2, 2);
2893       if (!operand_equal_p (mask1, mask2, 0)
2894           && (ifn == IFN_MASK_STORE || !allow_slp_p))
2895         {
2896           mask1 = strip_conversion (mask1);
2897           if (!mask1)
2898             return false;
2899           mask2 = strip_conversion (mask2);
2900           if (!mask2)
2901             return false;
2902           if (!operand_equal_p (mask1, mask2, 0))
2903             return false;
2904         }
2905       return true;
2906     }
2907
2908   return false;
2909 }
2910
2911 /* Function vect_analyze_data_ref_accesses.
2912
2913    Analyze the access pattern of all the data references in the loop.
2914
2915    FORNOW: the only access pattern that is considered vectorizable is a
2916            simple step 1 (consecutive) access.
2917
2918    FORNOW: handle only arrays and pointer accesses.  */
2919
2920 opt_result
2921 vect_analyze_data_ref_accesses (vec_info *vinfo)
2922 {
2923   unsigned int i;
2924   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2925   struct data_reference *dr;
2926
2927   DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2928
2929   if (datarefs.is_empty ())
2930     return opt_result::success ();
2931
2932   /* Sort the array of datarefs to make building the interleaving chains
2933      linear.  Don't modify the original vector's order, it is needed for
2934      determining what dependencies are reversed.  */
2935   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2936   datarefs_copy.qsort (dr_group_sort_cmp);
2937   hash_set<stmt_vec_info> to_fixup;
2938
2939   /* Build the interleaving chains.  */
2940   for (i = 0; i < datarefs_copy.length () - 1;)
2941     {
2942       data_reference_p dra = datarefs_copy[i];
2943       dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2944       stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2945       stmt_vec_info lastinfo = NULL;
2946       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2947           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2948         {
2949           ++i;
2950           continue;
2951         }
2952       for (i = i + 1; i < datarefs_copy.length (); ++i)
2953         {
2954           data_reference_p drb = datarefs_copy[i];
2955           dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2956           stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2957           if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2958               || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2959             break;
2960
2961           /* ???  Imperfect sorting (non-compatible types, non-modulo
2962              accesses, same accesses) can lead to a group being artificially
2963              split here as we don't just skip over those.  If it really
2964              matters we can push those to a worklist and re-iterate
2965              over them.  Then we can just skip ahead to the next DR here.  */
2966
2967           /* DRs in a different loop should not be put into the same
2968              interleaving group.  */
2969           if (gimple_bb (DR_STMT (dra))->loop_father
2970               != gimple_bb (DR_STMT (drb))->loop_father)
2971             break;
2972
2973           /* Check that the data-refs have same first location (except init)
2974              and they are both either store or load (not load and store,
2975              not masked loads or stores).  */
2976           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2977               || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2978                                         DR_BASE_ADDRESS (drb)) != 0
2979               || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2980               || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
2981             break;
2982
2983           /* Check that the data-refs have the same constant size.  */
2984           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2985           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2986           if (!tree_fits_uhwi_p (sza)
2987               || !tree_fits_uhwi_p (szb)
2988               || !tree_int_cst_equal (sza, szb))
2989             break;
2990
2991           /* Check that the data-refs have the same step.  */
2992           if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2993             break;
2994
2995           /* Check the types are compatible.
2996              ???  We don't distinguish this during sorting.  */
2997           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2998                                    TREE_TYPE (DR_REF (drb))))
2999             break;
3000
3001           /* Check that the DR_INITs are compile-time constants.  */
3002           if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3003               || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3004             break;
3005
3006           /* Different .GOMP_SIMD_LANE calls still give the same lane,
3007              just hold extra information.  */
3008           if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3009               && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3010               && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3011             break;
3012
3013           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
3014           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3015           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3016           HOST_WIDE_INT init_prev
3017             = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3018           gcc_assert (init_a <= init_b
3019                       && init_a <= init_prev
3020                       && init_prev <= init_b);
3021
3022           /* Do not place the same access in the interleaving chain twice.  */
3023           if (init_b == init_prev)
3024             {
3025               gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3026                           < gimple_uid (DR_STMT (drb)));
3027               /* Simply link in duplicates and fix up the chain below.  */
3028             }
3029           else
3030             {
3031               /* If init_b == init_a + the size of the type * k, we have an
3032                  interleaving, and DRA is accessed before DRB.  */
3033               HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3034               if (type_size_a == 0
3035                   || (init_b - init_a) % type_size_a != 0)
3036                 break;
3037
3038               /* If we have a store, the accesses are adjacent.  This splits
3039                  groups into chunks we support (we don't support vectorization
3040                  of stores with gaps).  */
3041               if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3042                 break;
3043
3044               /* If the step (if not zero or non-constant) is greater than the
3045                  difference between data-refs' inits this splits groups into
3046                  suitable sizes.  */
3047               if (tree_fits_shwi_p (DR_STEP (dra)))
3048                 {
3049                   HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3050                   if (step != 0 && step <= (init_b - init_a))
3051                     break;
3052                 }
3053             }
3054
3055           if (dump_enabled_p ())
3056             dump_printf_loc (MSG_NOTE, vect_location,
3057                              DR_IS_READ (dra)
3058                              ? "Detected interleaving load %T and %T\n"
3059                              : "Detected interleaving store %T and %T\n",
3060                              DR_REF (dra), DR_REF (drb));
3061
3062           /* Link the found element into the group list.  */
3063           if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3064             {
3065               DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3066               lastinfo = stmtinfo_a;
3067             }
3068           DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3069           DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3070           lastinfo = stmtinfo_b;
3071
3072           STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3073             = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3074
3075           if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3076             dump_printf_loc (MSG_NOTE, vect_location,
3077                              "Load suitable for SLP vectorization only.\n");
3078
3079           if (init_b == init_prev
3080               && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3081               && dump_enabled_p ())
3082             dump_printf_loc (MSG_NOTE, vect_location,
3083                              "Queuing group with duplicate access for fixup\n");
3084         }
3085     }
3086
3087   /* Fixup groups with duplicate entries by splitting them.  */
3088   while (1)
3089     {
3090       hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3091       if (!(it != to_fixup.end ()))
3092         break;
3093       stmt_vec_info grp = *it;
3094       to_fixup.remove (grp);
3095
3096       /* Find the earliest duplicate group member.  */
3097       unsigned first_duplicate = -1u;
3098       stmt_vec_info next, g = grp;
3099       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3100         {
3101           if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3102                                   DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3103               && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3104             first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3105           g = next;
3106         }
3107       if (first_duplicate == -1U)
3108         continue;
3109
3110       /* Then move all stmts after the first duplicate to a new group.
3111          Note this is a heuristic but one with the property that *it
3112          is fixed up completely.  */
3113       g = grp;
3114       stmt_vec_info newgroup = NULL, ng = grp;
3115       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3116         {
3117           if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3118             {
3119               DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3120               if (!newgroup)
3121                 newgroup = next;
3122               else
3123                 DR_GROUP_NEXT_ELEMENT (ng) = next;
3124               ng = next;
3125               DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3126             }
3127           else
3128             g = DR_GROUP_NEXT_ELEMENT (g);
3129         }
3130       DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3131
3132       /* Fixup the new group which still may contain duplicates.  */
3133       to_fixup.add (newgroup);
3134     }
3135
3136   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3137     {
3138       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3139       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3140           && !vect_analyze_data_ref_access (dr_info))
3141         {
3142           if (dump_enabled_p ())
3143             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3144                              "not vectorized: complicated access pattern.\n");
3145
3146           if (is_a <bb_vec_info> (vinfo))
3147             {
3148               /* Mark the statement as not vectorizable.  */
3149               STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3150               continue;
3151             }
3152           else
3153             {
3154               datarefs_copy.release ();
3155               return opt_result::failure_at (dr_info->stmt->stmt,
3156                                              "not vectorized:"
3157                                              " complicated access pattern.\n");
3158             }
3159         }
3160     }
3161
3162   datarefs_copy.release ();
3163   return opt_result::success ();
3164 }
3165
3166 /* Function vect_vfa_segment_size.
3167
3168    Input:
3169      DR_INFO: The data reference.
3170      LENGTH_FACTOR: segment length to consider, counted in iterations.
3171
3172    Return DR_STEP * (LENGTH_FACTOR - 1) as a sizetype value suitable for
3173    the dr_with_seg_len::seg_len field.  This is the distance travelled by
3174    the pointer from the first iteration in the segment to the last; it
3175    excludes the size of the access and so only describes the first byte.  */
3176
3177 static tree
3178 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3179 {
3180   /* The pointer advances one step fewer than there are iterations.  */
3181   tree niters = fold_convert (sizetype, length_factor);
3182   tree nsteps = size_binop (MINUS_EXPR, niters, size_one_node);
3183   tree step = fold_convert (sizetype, DR_STEP (dr_info->dr));
3184   return size_binop (MULT_EXPR, step, nsteps);
3185 }
3186
3187 /* Return the worst-case number of bytes that the segment of DR_INFO covers
3188    beyond abs (vect_vfa_segment_size (DR_INFO)).  */
3189
3190 static unsigned HOST_WIDE_INT
3191 vect_vfa_access_size (dr_vec_info *dr_info)
3192 {
3193   stmt_vec_info info = dr_info->stmt;
3194   unsigned HOST_WIDE_INT scalar_bytes
3195     = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_info->dr))));
3196   unsigned HOST_WIDE_INT total_bytes = scalar_bytes;
3197   if (DR_GROUP_FIRST_ELEMENT (info))
3198     {
3199       gcc_assert (DR_GROUP_FIRST_ELEMENT (info) == info);
3200       total_bytes *= DR_GROUP_SIZE (info) - DR_GROUP_GAP (info);
3201     }
3202   /* An optimized explicit realignment might access a full vector's worth.  */
3203   bool full_vector_p
3204     = (STMT_VINFO_VEC_STMT (info)
3205        && (vect_supportable_dr_alignment (dr_info, false)
3206            == dr_explicit_realign_optimized));
3207   if (full_vector_p)
3208     total_bytes += (tree_to_uhwi (TYPE_SIZE_UNIT (STMT_VINFO_VECTYPE (info)))
3209                     - scalar_bytes);
3210   return total_bytes;
3211 }
3212
3213 /* Return the minimum alignment of the scalar accesses DR_INFO describes.  */
3214
3215 static unsigned int
3216 vect_vfa_align (dr_vec_info *dr_info)
3217 {
3218   tree scalar_type = TREE_TYPE (DR_REF (dr_info->dr));
3219   return TYPE_ALIGN_UNIT (scalar_type);
3220 }
3221
3222 /* Try to decide at compile time whether data references A and B, which
3223    have equal base and offset, alias.  SEGMENT_LENGTH_A, SEGMENT_LENGTH_B,
3224    ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent of
3225    dr_with_seg_len::{seg_len,access_size} for A and B.
3226
3227    Return 1 if the references are known to alias, 0 if they are known
3228    not to alias, and -1 if the question cannot be decided at compile
3229    time.  */
3230
3231 static int
3232 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3233                          tree segment_length_a, tree segment_length_b,
3234                          unsigned HOST_WIDE_INT access_size_a,
3235                          unsigned HOST_WIDE_INT access_size_b)
3236 {
3237   poly_offset_int start_a = wi::to_poly_offset (DR_INIT (a->dr));
3238   poly_offset_int start_b = wi::to_poly_offset (DR_INIT (b->dr));
3239   poly_uint64 span_a, span_b;
3240
3241   /* For a negative step, negate the segment length to get the length
3242      of the range and adjust the start of the range by TYPE_SIZE_UNIT
3243      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3244      [a, a+12).  */
3245   if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) >= 0)
3246     span_a = tree_to_poly_uint64 (segment_length_a);
3247   else
3248     {
3249       span_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3250       start_a = (start_a + access_size_a) - span_a;
3251     }
3252   if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) >= 0)
3253     span_b = tree_to_poly_uint64 (segment_length_b);
3254   else
3255     {
3256       span_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3257       start_b = (start_b + access_size_b) - span_b;
3258     }
3259
3260   /* Each range also includes the bytes of the access itself.  */
3261   span_a += access_size_a;
3262   span_b += access_size_b;
3263
3264   if (ranges_known_overlap_p (start_a, span_a, start_b, span_b))
3265     return 1;
3266
3267   /* A possible but unproven overlap leaves the question undecided.  */
3268   if (ranges_maybe_overlap_p (start_a, span_a, start_b, span_b))
3269     return -1;
3270
3271   return 0;
3272 }
3273
3274 /* Return true if every nonzero dependence distance that DDR records for
3275    loop LOOP_DEPTH is >= VF, ignoring positive distances of reversed DDRs.  */
3276
3277 static bool
3278 dependence_distance_ge_vf (data_dependence_relation *ddr,
3279                            unsigned int loop_depth, poly_uint64 vf)
3280 {
3281   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE)
3282     return false;
3283   if (DDR_NUM_DIST_VECTS (ddr) == 0)
3284     return false;
3285
3286   /* An exact dependence should have limited the VF instead.  */
3287   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3288
3289   unsigned int ix;
3290   lambda_vector dist_vec;
3291   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), ix, dist_vec)
3292     {
3293       HOST_WIDE_INT dist = dist_vec[loop_depth];
3294       if (dist == 0 || (dist > 0 && DDR_REVERSED_P (ddr)))
3295         continue;
3296       if (maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3297         return false;
3298     }
3299
3300   if (dump_enabled_p ())
3301     dump_printf_loc (MSG_NOTE, vect_location,
3302                      "dependence distance between %T and %T is >= VF\n",
3303                      DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3304   return true;
3305 }
3306
3307 /* Print LOWER_BOUND as "unsigned (EXPR) >= MIN" or "abs (EXPR) >= MIN"
3308    using flags DUMP_KIND.  Dumps are known to be enabled.  */
3309
3310 static void
3311 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3312 {
3313   const char *op = lower_bound.unsigned_p ? "unsigned" : "abs";
3314   dump_printf (dump_kind, "%s (%T) >= ", op, lower_bound.expr);
3315   dump_dec (dump_kind, lower_bound.min_value);
3316 }
3317
3318 /* Make the vectorized loop require the vec_lower_bound (EXPR, UNSIGNED_P,
3319    MIN_VALUE), merging it into any bound already recorded for EXPR.  */
3320
3321 static void
3322 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3323                         poly_uint64 min_value)
3324 {
3325   vec<vec_lower_bound> bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3326   for (unsigned int idx = 0; idx < bounds.length (); ++idx)
3327     {
3328       vec_lower_bound &old = bounds[idx];
3329       if (!operand_equal_p (old.expr, expr, 0))
3330         continue;
3331       unsigned_p &= old.unsigned_p;
3332       min_value = upper_bound (old.min_value, min_value);
3333       if (old.unsigned_p == unsigned_p && !maybe_lt (old.min_value, min_value))
3334         return;
3335       old.unsigned_p = unsigned_p;
3336       old.min_value = min_value;
3337       if (dump_enabled_p ())
3338         {
3339           dump_printf_loc (MSG_NOTE, vect_location,
3340                            "updating run-time check to ");
3341           dump_lower_bound (MSG_NOTE, old);
3342           dump_printf (MSG_NOTE, "\n");
3343         }
3344       return;
3345     }
3346
3347   vec_lower_bound new_bound (expr, unsigned_p, min_value);
3348   if (dump_enabled_p ())
3349     {
3350       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3351       dump_lower_bound (MSG_NOTE, new_bound);
3352       dump_printf (MSG_NOTE, "\n");
3353     }
3354   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (new_bound);
3355 }
3356
3357 /* Return true if the step of the vectorized form of DR_INFO is unlikely to
3358    span fewer than GAP bytes, based on estimates of GAP and of the VF.  */
3359
3360 static bool
3361 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3362                   poly_int64 gap)
3363 {
3364   HOST_WIDE_INT nscalars
3365     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3366   stmt_vec_info leader = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
3367   if (leader)
3368     nscalars *= DR_GROUP_SIZE (leader);
3369   HOST_WIDE_INT nbytes = nscalars * vect_get_scalar_dr_size (dr_info);
3370   return estimated_poly_value (gap) <= nbytes;
3371 }
3372
3373 /* Return true if DR_INFO_A and DR_INFO_B are known not to alias whenever
3374    abs (DR_STEP (DR_INFO_A->dr)) >= N for some N, storing that N in
3375    *LOWER_BOUND_OUT.  *LOWER_BOUND_OUT is untouched when returning false.  */
3376
3377 static bool
3378 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3379                                 poly_uint64 *lower_bound_out)
3380 {
3381   data_reference *dr_a = dr_info_a->dr;
3382   data_reference *dr_b = dr_info_b->dr;
3383   /* The references must share their base address, offset and step...  */
3384   if (!(operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3385         && operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3386         && operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)))
3387     return false;
3388
3389   /* ...and be separated by a constant gap of known sign.  */
3390   poly_int64 start_a, start_b;
3391   if (!(poly_int_tree_p (DR_INIT (dr_a), &start_a)
3392         && poly_int_tree_p (DR_INIT (dr_b), &start_b)
3393         && ordered_p (start_a, start_b)))
3394     return false;
3395
3396   /* Make DR_INFO_A the reference with the lower start.  */
3397   if (maybe_lt (start_b, start_a))
3398     {
3399       std::swap (start_a, start_b);
3400       std::swap (dr_info_a, dr_info_b);
3401     }
3402
3403   /* Accesses that could be dependent within a scalar iteration are
3404      only acceptable if their order would be retained.  */
3405   if (maybe_gt (start_a + vect_get_scalar_dr_size (dr_info_a), start_b)
3406       && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3407     return false;
3408
3409   /* No alias if abs (DR_STEP) >= the bytes spanned by both accesses.  */
3410   *lower_bound_out = start_b + vect_get_scalar_dr_size (dr_info_b) - start_a;
3411   return true;
3412 }
3413
3414 /* Function vect_prune_runtime_alias_test_list.
3415
3416    Prune the list of ddrs in LOOP_VINFO to be tested at run-time by
3417    versioning for alias, merging several alias checks into one if possible.
3418    Fail if two references are known to alias at compile time or if more than
3419    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS checks remain, else succeed.  */
3420
3421 opt_result
3422 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3423 {
3424   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3425   hash_set <tree_pair_hash> compared_objects;
3426
3427   vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3428   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3429     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3430   vec<vec_object_pair> &check_unequal_addrs
3431     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3432   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3433   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3434
3435   ddr_p ddr;
3436   unsigned int i;
3437   tree length_factor;
3438
3439   DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3440
3441   /* Step values are irrelevant for aliasing if the number of vector
3442      iterations is equal to the number of scalar iterations (which can
3443      happen for fully-SLP loops).  */
3444   bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3445
3446   if (!ignore_step_p)
3447     {
3448       /* Convert the checks for nonzero steps into bound tests.  */
3449       tree value;
3450       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3451         vect_check_lower_bound (loop_vinfo, value, true, 1);
3452     }
3453
3454   if (may_alias_ddrs.is_empty ())
3455     return opt_result::success ();
3456
3457   comp_alias_ddrs.create (may_alias_ddrs.length ());
3458
3459   unsigned int loop_depth
3460     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3461                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
3462
3463   /* First, we collect all data ref pairs for aliasing checks.  */
3464   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3465     {
3466       int comp_res;
3467       poly_uint64 lower_bound;
3468       tree segment_length_a, segment_length_b;
3469       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3470       unsigned int align_a, align_b;
3471
3472       /* Ignore the alias if the VF we chose ended up being no greater
3473          than the dependence distance.  */
3474       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3475         continue;
3476
3477       if (DDR_OBJECT_A (ddr))
3478         {
3479           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3480           if (!compared_objects.add (new_pair))
3481             {
3482               if (dump_enabled_p ())
3483                 dump_printf_loc (MSG_NOTE, vect_location,
3484                                  "checking that %T and %T"
3485                                  " have different addresses\n",
3486                                  new_pair.first, new_pair.second);
3487               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3488             }
3489           continue;
3490         }
3491
3492       dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3493       stmt_vec_info stmt_info_a = dr_info_a->stmt;
3494
3495       dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3496       stmt_vec_info stmt_info_b = dr_info_b->stmt;
3497
3498       /* Skip the pair if inter-iteration dependencies are irrelevant
3499          and intra-iteration dependencies are guaranteed to be honored.  */
3500       if (ignore_step_p
3501           && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b)
3502               || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3503                                                  &lower_bound)))
3504         {
3505           if (dump_enabled_p ())
3506             dump_printf_loc (MSG_NOTE, vect_location,
3507                              "no need for alias check between "
3508                              "%T and %T when VF is 1\n",
3509                              DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3510           continue;
3511         }
3512
3513       /* See whether we can handle the alias using a bounds check on
3514          the step, and whether that's likely to be the best approach.
3515          (It might not be, for example, if the minimum step is much larger
3516          than the number of bytes handled by one vector iteration.)  */
3517       if (!ignore_step_p
3518           && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3519           && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3520                                              &lower_bound)
3521           && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3522               || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3523         {
3524           bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3525           if (dump_enabled_p ())
3526             {
3527               dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3528                                "%T and %T when the step %T is outside ",
3529                                DR_REF (dr_info_a->dr),
3530                                DR_REF (dr_info_b->dr),
3531                                DR_STEP (dr_info_a->dr));
3532               if (unsigned_p)
3533                 dump_printf (MSG_NOTE, "[0");
3534               else
3535                 {
3536                   dump_printf (MSG_NOTE, "(");
3537                   dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3538                 }
3539               dump_printf (MSG_NOTE, ", ");
3540               dump_dec (MSG_NOTE, lower_bound);
3541               dump_printf (MSG_NOTE, ")\n");
3542             }
3543           vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3544                                   unsigned_p, lower_bound);
3545           continue;
3546         }
3547       /* Describe grouped accesses by the first element of their group.  */
3548       stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3549       if (dr_group_first_a)
3550         {
3551           stmt_info_a = dr_group_first_a;
3552           dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3553         }
3554
3555       stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3556       if (dr_group_first_b)
3557         {
3558           stmt_info_b = dr_group_first_b;
3559           dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3560         }
3561       /* Use zero-length segments if step values are irrelevant.  */
3562       if (ignore_step_p)
3563         {
3564           segment_length_a = size_zero_node;
3565           segment_length_b = size_zero_node;
3566         }
3567       else
3568         {
3569           if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3570                                 DR_STEP (dr_info_b->dr), 0))
3571             length_factor = scalar_loop_iters;
3572           else
3573             length_factor = size_int (vect_factor);
3574           segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3575           segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3576         }
3577       access_size_a = vect_vfa_access_size (dr_info_a);
3578       access_size_b = vect_vfa_access_size (dr_info_b);
3579       align_a = vect_vfa_align (dr_info_a);
3580       align_b = vect_vfa_align (dr_info_b);
3581       /* Order the pair by base address and then by offset.  */
3582       comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr),
3583                                         DR_BASE_ADDRESS (dr_info_b->dr));
3584       if (comp_res == 0)
3585         comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr),
3586                                           DR_OFFSET (dr_info_b->dr));
3587
3588       /* See whether the alias is known at compilation time.  */
3589       if (comp_res == 0
3590           && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3591           && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3592           && poly_int_tree_p (segment_length_a)
3593           && poly_int_tree_p (segment_length_b))
3594         {
3595           int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3596                                              segment_length_a,
3597                                              segment_length_b,
3598                                              access_size_a,
3599                                              access_size_b);
3600           if (res >= 0 && dump_enabled_p ())
3601             {
3602               dump_printf_loc (MSG_NOTE, vect_location,
3603                                "can tell at compile time that %T and %T",
3604                                DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3605               if (res == 0)
3606                 dump_printf (MSG_NOTE, " do not alias\n");
3607               else
3608                 dump_printf (MSG_NOTE, " alias\n");
3609             }
3610
3611           if (res == 0)
3612             continue;
3613
3614           if (res == 1)
3615             return opt_result::failure_at (stmt_info_b->stmt,
3616                                            "not vectorized:"
3617                                            " compilation time alias: %G%G",
3618                                            stmt_info_a->stmt,
3619                                            stmt_info_b->stmt);
3620         }
3621
3622       dr_with_seg_len_pair_t dr_with_seg_len_pair
3623         (dr_with_seg_len (dr_info_a->dr, segment_length_a,
3624                           access_size_a, align_a),
3625          dr_with_seg_len (dr_info_b->dr, segment_length_b,
3626                           access_size_b, align_b));
3627
3628       /* Canonicalize pairs by sorting the two DR members.  */
3629       if (comp_res > 0)
3630         std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3631
3632       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3633     }
3634
3635   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3636
3637   unsigned int count = (comp_alias_ddrs.length ()
3638                         + check_unequal_addrs.length ());
3639
3640   if (dump_enabled_p ())
3641     dump_printf_loc (MSG_NOTE, vect_location,
3642                      "improved number of alias checks from %d to %d\n",
3643                      may_alias_ddrs.length (), count);
3644   if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3645     return opt_result::failure_at
3646       (vect_location,
3647        "number of versioning for alias "
3648        "run-time tests exceeds %d "
3649        "(--param vect-max-version-for-alias-checks)\n",
3650        PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3651
3652   return opt_result::success ();
3653 }
3654
3655 /* Check whether the target provides an internal function for a gather
3656    load (READ_P true) or scatter store (READ_P false), conditional if
3657    MASKED_P.  VECTYPE is the vector type of the access and MEMORY_TYPE the
3658    type of the memory elements being loaded or stored.  OFFSET_BITS is the
3659    number of bits in each scalar offset and OFFSET_SIGN its sign.  SCALE
3660    is the amount by which the offset should be multiplied *after* it has
3661    been converted to address width.
3662
3663    Return true if the function is supported, storing the function id in
3664    *IFN_OUT and the element type of VECTYPE in *ELEMENT_TYPE_OUT.  */
3665
3666 bool
3667 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3668                           tree memory_type, unsigned int offset_bits,
3669                           signop offset_sign, int scale,
3670                           internal_fn *ifn_out, tree *element_type_out)
3671 {
3672   tree element_type = TREE_TYPE (vectype);
3673   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3674   unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (element_type));
3675
3676   /* Internal functions need offsets as wide as the vector elements.  Narrower
3677      offsets can be extended, but it isn't safe to truncate wider ones.  */
3678   if (offset_bits > element_bits)
3679     return false;
3680
3681   /* For now the vector and memory elements must have the same width.  */
3682   if (element_bits != memory_bits)
3683     return false;
3684
3685   /* Pick the internal function for this kind of access.  */
3686   internal_fn ifn;
3687   if (masked_p)
3688     ifn = read_p ? IFN_MASK_GATHER_LOAD : IFN_MASK_SCATTER_STORE;
3689   else
3690     ifn = read_p ? IFN_GATHER_LOAD : IFN_SCATTER_STORE;
3691
3692   /* Give up unless the target supports this combination.  */
3693   if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3694                                                offset_sign, scale))
3695     return false;
3696
3697   *ifn_out = ifn;
3698   *element_type_out = element_type;
3699   return true;
3700 }
3701
3702 /* Fill in INFO with a description of STMT_INFO, which is a call to an
3703    internal gather load or scatter store function.  */
3704
3705 static void
3706 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3707                                    gather_scatter_info *info)
3708 {
3709   gcall *call = as_a <gcall *> (stmt_info->stmt);
3710
3711   info->ifn = gimple_call_internal_fn (call);
3712   info->base = gimple_call_arg (call, 0);
3713   info->offset = gimple_call_arg (call, 1);
3714   info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3715   info->element_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
3716   info->memory_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3717
3718   /* These fields are left empty or unknown for an internal function call.  */
3719   info->decl = NULL_TREE;
3720   info->offset_dt = vect_unknown_def_type;
3721   info->offset_vectype = NULL_TREE;
3722 }
3723
3724 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3725    gather load or scatter store.  Describe the operation in *INFO if so.  */
3726
3727 bool
3728 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3729                            gather_scatter_info *info)
3730 {
3731   HOST_WIDE_INT scale = 1;
3732   poly_int64 pbitpos, pbitsize;
3733   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3734   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3735   tree offtype = NULL_TREE;
3736   tree decl = NULL_TREE, base, off;
3737   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3738   tree memory_type = TREE_TYPE (DR_REF (dr));
3739   machine_mode pmode;
3740   int punsignedp, reversep, pvolatilep = 0;
3741   internal_fn ifn;
3742   tree element_type;
3743   bool masked_p = false;
3744
3745   /* See whether this is already a call to a gather/scatter internal function.
3746      If not, see whether it's a masked load or store.  */
3747   gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3748   if (call && gimple_call_internal_p (call))
3749     {
3750       ifn = gimple_call_internal_fn (call);
3751       if (internal_gather_scatter_fn_p (ifn))
3752         {
3753           vect_describe_gather_scatter_call (stmt_info, info);
3754           return true;
3755         }
3756       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3757     }
3758
3759   /* True if we should aim to use internal functions rather than
3760      built-in functions.  */
3761   bool use_ifn_p = (DR_IS_READ (dr)
3762                     ? supports_vec_gather_load_p ()
3763                     : supports_vec_scatter_store_p ());
3764
3765   base = DR_REF (dr);
3766   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3767      see if we can use the def stmt of the address.  */
3768   if (masked_p
3769       && TREE_CODE (base) == MEM_REF
3770       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3771       && integer_zerop (TREE_OPERAND (base, 1))
3772       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3773     {
3774       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3775       if (is_gimple_assign (def_stmt)
3776           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3777         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3778     }
3779
3780   /* The gather and scatter builtins need address of the form
3781      loop_invariant + vector * {1, 2, 4, 8}
3782      or
3783      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3784      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3785      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3786      multiplications and additions in it.  To get a vector, we need
3787      a single SSA_NAME that will be defined in the loop and will
3788      contain everything that is not loop invariant and that can be
3789      vectorized.  The following code attempts to find such a preexisting
3790      SSA_NAME OFF and put the loop invariants into a tree BASE
3791      that can be gimplified before the loop.  */
3792   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3793                               &punsignedp, &reversep, &pvolatilep);
3794   if (reversep)
3795     return false;
3796
3797   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3798
3799   if (TREE_CODE (base) == MEM_REF)
3800     {
3801       if (!integer_zerop (TREE_OPERAND (base, 1)))
3802         {
3803           if (off == NULL_TREE)
3804             off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3805           else
3806             off = size_binop (PLUS_EXPR, off,
3807                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3808         }
3809       base = TREE_OPERAND (base, 0);
3810     }
3811   else
3812     base = build_fold_addr_expr (base);
3813
3814   if (off == NULL_TREE)
3815     off = size_zero_node;
3816
3817   /* If base is not loop invariant, either off is 0, then we start with just
3818      the constant offset in the loop invariant BASE and continue with base
3819      as OFF, otherwise give up.
3820      We could handle that case by gimplifying the addition of base + off
3821      into some SSA_NAME and use that as off, but for now punt.  */
3822   if (!expr_invariant_in_loop_p (loop, base))
3823     {
3824       if (!integer_zerop (off))
3825         return false;
3826       off = base;
3827       base = size_int (pbytepos);
3828     }
3829   /* Otherwise put base + constant offset into the loop invariant BASE
3830      and continue with OFF.  */
3831   else
3832     {
3833       base = fold_convert (sizetype, base);
3834       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3835     }
3836
3837   /* OFF at this point may be either a SSA_NAME or some tree expression
3838      from get_inner_reference.  Try to peel off loop invariants from it
3839      into BASE as long as possible.  */
3840   STRIP_NOPS (off);
3841   while (offtype == NULL_TREE)
3842     {
3843       enum tree_code code;
3844       tree op0, op1, add = NULL_TREE;
3845
3846       if (TREE_CODE (off) == SSA_NAME)
3847         {
3848           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3849
3850           if (expr_invariant_in_loop_p (loop, off))
3851             return false;
3852
3853           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3854             break;
3855
3856           op0 = gimple_assign_rhs1 (def_stmt);
3857           code = gimple_assign_rhs_code (def_stmt);
3858           op1 = gimple_assign_rhs2 (def_stmt);
3859         }
3860       else
3861         {
3862           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3863             return false;
3864           code = TREE_CODE (off);
3865           extract_ops_from_tree (off, &code, &op0, &op1);
3866         }
3867       switch (code)
3868         {
3869         case POINTER_PLUS_EXPR:
3870         case PLUS_EXPR:
3871           if (expr_invariant_in_loop_p (loop, op0))
3872             {
3873               add = op0;
3874               off = op1;
3875             do_add:
3876               add = fold_convert (sizetype, add);
3877               if (scale != 1)
3878                 add = size_binop (MULT_EXPR, add, size_int (scale));
3879               base = size_binop (PLUS_EXPR, base, add);
3880               continue;
3881             }
3882           if (expr_invariant_in_loop_p (loop, op1))
3883             {
3884               add = op1;
3885               off = op0;
3886               goto do_add;
3887             }
3888           break;
3889         case MINUS_EXPR:
3890           if (expr_invariant_in_loop_p (loop, op1))
3891             {
3892               add = fold_convert (sizetype, op1);
3893               add = size_binop (MINUS_EXPR, size_zero_node, add);
3894               off = op0;
3895               goto do_add;
3896             }
3897           break;
3898         case MULT_EXPR:
3899           if (scale == 1 && tree_fits_shwi_p (op1))
3900             {
3901               int new_scale = tree_to_shwi (op1);
3902               /* Only treat this as a scaling operation if the target
3903                  supports it.  */
3904               if (use_ifn_p
3905                   && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3906                                                 vectype, memory_type, 1,
3907                                                 TYPE_SIGN (TREE_TYPE (op0)),
3908                                                 new_scale, &ifn,
3909                                                 &element_type))
3910                 break;
3911               scale = new_scale;
3912               off = op0;
3913               continue;
3914             }
3915           break;
3916         case SSA_NAME:
3917           off = op0;
3918           continue;
3919         CASE_CONVERT:
3920           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3921               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3922             break;
3923           if (TYPE_PRECISION (TREE_TYPE (op0))
3924               == TYPE_PRECISION (TREE_TYPE (off)))
3925             {
3926               off = op0;
3927               continue;
3928             }
3929
3930           /* The internal functions need the offset to be the same width
3931              as the elements of VECTYPE.  Don't include operations that
3932              cast the offset from that width to a different width.  */
3933           if (use_ifn_p
3934               && (int_size_in_bytes (TREE_TYPE (vectype))
3935                   == int_size_in_bytes (TREE_TYPE (off))))
3936             break;
3937
3938           if (TYPE_PRECISION (TREE_TYPE (op0))
3939               < TYPE_PRECISION (TREE_TYPE (off)))
3940             {
3941               off = op0;
3942               offtype = TREE_TYPE (off);
3943               STRIP_NOPS (off);
3944               continue;
3945             }
3946           break;
3947         default:
3948           break;
3949         }
3950       break;
3951     }
3952
3953   /* If at the end OFF still isn't a SSA_NAME or isn't
3954      defined in the loop, punt.  */
3955   if (TREE_CODE (off) != SSA_NAME
3956       || expr_invariant_in_loop_p (loop, off))
3957     return false;
3958
3959   if (offtype == NULL_TREE)
3960     offtype = TREE_TYPE (off);
3961
3962   if (use_ifn_p)
3963     {
3964       if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3965                                      memory_type, TYPE_PRECISION (offtype),
3966                                      TYPE_SIGN (offtype), scale, &ifn,
3967                                      &element_type))
3968         return false;
3969     }
3970   else
3971     {
3972       if (DR_IS_READ (dr))
3973         {
3974           if (targetm.vectorize.builtin_gather)
3975             decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3976         }
3977       else
3978         {
3979           if (targetm.vectorize.builtin_scatter)
3980             decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3981         }
3982
3983       if (!decl)
3984         return false;
3985
3986       ifn = IFN_LAST;
3987       element_type = TREE_TYPE (vectype);
3988     }
3989
3990   info->ifn = ifn;
3991   info->decl = decl;
3992   info->base = base;
3993   info->offset = off;
3994   info->offset_dt = vect_unknown_def_type;
3995   info->offset_vectype = NULL_TREE;
3996   info->scale = scale;
3997   info->element_type = element_type;
3998   info->memory_type = memory_type;
3999   return true;
4000 }
4001
4002 /* Find the data references in STMT, analyze them with respect to LOOP and
4003    append them to DATAREFS.  Return false if datarefs in this stmt cannot
4004    be handled.  */
4005
4006 opt_result
4007 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4008                                vec<data_reference_p> *datarefs)
4009 {
4010   /* We can ignore clobbers for dataref analysis - they are removed during
4011      loop vectorization and BB vectorization checks dependences with a
4012      stmt walk.  */
4013   if (gimple_clobber_p (stmt))
4014     return opt_result::success ();
4015
4016   if (gimple_has_volatile_ops (stmt))
4017     return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4018                                    stmt);
4019
4020   if (stmt_can_throw_internal (cfun, stmt))
4021     return opt_result::failure_at (stmt,
4022                                    "not vectorized:"
4023                                    " statement can throw an exception: %G",
4024                                    stmt);
4025
4026   auto_vec<data_reference_p, 2> refs;
4027   opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4028   if (!res)
4029     return res;
4030
4031   if (refs.is_empty ())
4032     return opt_result::success ();
4033
4034   if (refs.length () > 1)
4035     return opt_result::failure_at (stmt,
4036                                    "not vectorized:"
4037                                    " more than one data ref in stmt: %G", stmt);
4038
4039   if (gcall *call = dyn_cast <gcall *> (stmt))
4040     if (!gimple_call_internal_p (call)
4041         || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4042             && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4043       return opt_result::failure_at (stmt,
4044                                      "not vectorized: dr in a call %G", stmt);
4045
4046   data_reference_p dr = refs.pop ();
4047   if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4048       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4049     return opt_result::failure_at (stmt,
4050                                    "not vectorized:"
4051                                    " statement is bitfield access %G", stmt);
4052
4053   if (DR_BASE_ADDRESS (dr)
4054       && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4055     return opt_result::failure_at (stmt,
4056                                    "not vectorized:"
4057                                    " base addr of dr is a constant\n");
4058
4059   /* Check whether this may be a SIMD lane access and adjust the
4060      DR to make it easier for us to handle it.  */
4061   if (loop
4062       && loop->simduid
4063       && (!DR_BASE_ADDRESS (dr)
4064           || !DR_OFFSET (dr)
4065           || !DR_INIT (dr)
4066           || !DR_STEP (dr)))
4067     {
4068       struct data_reference *newdr
4069         = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4070                            DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4071       if (DR_BASE_ADDRESS (newdr)
4072           && DR_OFFSET (newdr)
4073           && DR_INIT (newdr)
4074           && DR_STEP (newdr)
4075           && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4076           && integer_zerop (DR_STEP (newdr)))
4077         {
4078           tree base_address = DR_BASE_ADDRESS (newdr);
4079           tree off = DR_OFFSET (newdr);
4080           tree step = ssize_int (1);
4081           if (integer_zerop (off)
4082               && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4083             {
4084               off = TREE_OPERAND (base_address, 1);
4085               base_address = TREE_OPERAND (base_address, 0);
4086             }
4087           STRIP_NOPS (off);
4088           if (TREE_CODE (off) == MULT_EXPR
4089               && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4090             {
4091               step = TREE_OPERAND (off, 1);
4092               off = TREE_OPERAND (off, 0);
4093               STRIP_NOPS (off);
4094             }
4095           if (CONVERT_EXPR_P (off)
4096               && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4097                   < TYPE_PRECISION (TREE_TYPE (off))))
4098             off = TREE_OPERAND (off, 0);
4099           if (TREE_CODE (off) == SSA_NAME)
4100             {
4101               gimple *def = SSA_NAME_DEF_STMT (off);
4102               /* Look through widening conversion.  */
4103               if (is_gimple_assign (def)
4104                   && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4105                 {
4106                   tree rhs1 = gimple_assign_rhs1 (def);
4107                   if (TREE_CODE (rhs1) == SSA_NAME
4108                       && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4109                       && (TYPE_PRECISION (TREE_TYPE (off))
4110                           > TYPE_PRECISION (TREE_TYPE (rhs1))))
4111                     def = SSA_NAME_DEF_STMT (rhs1);
4112                 }
4113               if (is_gimple_call (def)
4114                   && gimple_call_internal_p (def)
4115                   && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4116                 {
4117                   tree arg = gimple_call_arg (def, 0);
4118                   tree reft = TREE_TYPE (DR_REF (newdr));
4119                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
4120                   arg = SSA_NAME_VAR (arg);
4121                   if (arg == loop->simduid
4122                       /* For now.  */
4123                       && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4124                     {
4125                       DR_BASE_ADDRESS (newdr) = base_address;
4126                       DR_OFFSET (newdr) = ssize_int (0);
4127                       DR_STEP (newdr) = step;
4128                       DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4129                       DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4130                       /* Mark as simd-lane access.  */
4131                       tree arg2 = gimple_call_arg (def, 1);
4132                       newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4133                       free_data_ref (dr);
4134                       datarefs->safe_push (newdr);
4135                       return opt_result::success ();
4136                     }
4137                 }
4138             }
4139         }
4140       free_data_ref (newdr);
4141     }
4142
4143   datarefs->safe_push (dr);
4144   return opt_result::success ();
4145 }
4146
4147 /* Function vect_analyze_data_refs.
4148
4149   Find all the data references in the loop or basic block.
4150
4151    The general structure of the analysis of data refs in the vectorizer is as
4152    follows:
4153    1- vect_analyze_data_refs(loop/bb): call
4154       compute_data_dependences_for_loop/bb to find and analyze all data-refs
4155       in the loop/bb and their dependences.
4156    2- vect_analyze_dependences(): apply dependence testing using ddrs.
4157    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4158    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4159
4160 */
4161
4162 opt_result
4163 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4164 {
4165   struct loop *loop = NULL;
4166   unsigned int i;
4167   struct data_reference *dr;
4168   tree scalar_type;
4169
4170   DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4171
4172   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4173     loop = LOOP_VINFO_LOOP (loop_vinfo);
4174
4175   /* Go through the data-refs, check that the analysis succeeded.  Update
4176      pointer from stmt_vec_info struct to DR and vectype.  */
4177
4178   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4179   FOR_EACH_VEC_ELT (datarefs, i, dr)
4180     {
4181       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4182       poly_uint64 vf;
4183
4184       gcc_assert (DR_REF (dr));
4185       stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4186       gcc_assert (!stmt_info->dr_aux.dr);
4187       stmt_info->dr_aux.dr = dr;
4188       stmt_info->dr_aux.stmt = stmt_info;
4189
4190       /* Check that analysis of the data-ref succeeded.  */
4191       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4192           || !DR_STEP (dr))
4193         {
4194           bool maybe_gather
4195             = DR_IS_READ (dr)
4196               && !TREE_THIS_VOLATILE (DR_REF (dr))
4197               && (targetm.vectorize.builtin_gather != NULL
4198                   || supports_vec_gather_load_p ());
4199           bool maybe_scatter
4200             = DR_IS_WRITE (dr)
4201               && !TREE_THIS_VOLATILE (DR_REF (dr))
4202               && (targetm.vectorize.builtin_scatter != NULL
4203                   || supports_vec_scatter_store_p ());
4204
4205           /* If target supports vector gather loads or scatter stores,
4206              see if they can't be used.  */
4207           if (is_a <loop_vec_info> (vinfo)
4208               && !nested_in_vect_loop_p (loop, stmt_info))
4209             {
4210               if (maybe_gather || maybe_scatter)
4211                 {
4212                   if (maybe_gather)
4213                     gatherscatter = GATHER;
4214                   else
4215                     gatherscatter = SCATTER;
4216                 }
4217             }
4218
4219           if (gatherscatter == SG_NONE)
4220             {
4221               if (dump_enabled_p ())
4222                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4223                                  "not vectorized: data ref analysis "
4224                                  "failed %G", stmt_info->stmt);
4225               if (is_a <bb_vec_info> (vinfo))
4226                 {
4227                   /* In BB vectorization the ref can still participate
4228                      in dependence analysis, we just can't vectorize it.  */
4229                   STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4230                   continue;
4231                 }
4232               return opt_result::failure_at (stmt_info->stmt,
4233                                              "not vectorized:"
4234                                              " data ref analysis failed: %G",
4235                                              stmt_info->stmt);
4236             }
4237         }
4238
4239       /* See if this was detected as SIMD lane access.  */
4240       if (dr->aux == (void *)-1
4241           || dr->aux == (void *)-2
4242           || dr->aux == (void *)-3
4243           || dr->aux == (void *)-4)
4244         {
4245           if (nested_in_vect_loop_p (loop, stmt_info))
4246             return opt_result::failure_at (stmt_info->stmt,
4247                                            "not vectorized:"
4248                                            " data ref analysis failed: %G",
4249                                            stmt_info->stmt);
4250           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4251             = -(uintptr_t) dr->aux;
4252         }
4253
4254       tree base = get_base_address (DR_REF (dr));
4255       if (base && VAR_P (base) && DECL_NONALIASED (base))
4256         {
4257           if (dump_enabled_p ())
4258             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4259                              "not vectorized: base object not addressable "
4260                              "for stmt: %G", stmt_info->stmt);
4261           if (is_a <bb_vec_info> (vinfo))
4262             {
4263               /* In BB vectorization the ref can still participate
4264                  in dependence analysis, we just can't vectorize it.  */
4265               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4266               continue;
4267             }
4268           return opt_result::failure_at (stmt_info->stmt,
4269                                          "not vectorized: base object not"
4270                                          " addressable for stmt: %G",
4271                                          stmt_info->stmt);
4272         }
4273
4274       if (is_a <loop_vec_info> (vinfo)
4275           && DR_STEP (dr)
4276           && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4277         {
4278           if (nested_in_vect_loop_p (loop, stmt_info))
4279             return opt_result::failure_at (stmt_info->stmt,
4280                                            "not vectorized:"
4281                                            "not suitable for strided load %G",
4282                                            stmt_info->stmt);
4283           STMT_VINFO_STRIDED_P (stmt_info) = true;
4284         }
4285
4286       /* Update DR field in stmt_vec_info struct.  */
4287
4288       /* If the dataref is in an inner-loop of the loop that is considered for
4289          for vectorization, we also want to analyze the access relative to
4290          the outer-loop (DR contains information only relative to the
4291          inner-most enclosing loop).  We do that by building a reference to the
4292          first location accessed by the inner-loop, and analyze it relative to
4293          the outer-loop.  */
4294       if (loop && nested_in_vect_loop_p (loop, stmt_info))
4295         {
4296           /* Build a reference to the first location accessed by the
4297              inner loop: *(BASE + INIT + OFFSET).  By construction,
4298              this address must be invariant in the inner loop, so we
4299              can consider it as being used in the outer loop.  */
4300           tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4301           tree offset = unshare_expr (DR_OFFSET (dr));
4302           tree init = unshare_expr (DR_INIT (dr));
4303           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4304                                           init, offset);
4305           tree init_addr = fold_build_pointer_plus (base, init_offset);
4306           tree init_ref = build_fold_indirect_ref (init_addr);
4307
4308           if (dump_enabled_p ())
4309             dump_printf_loc (MSG_NOTE, vect_location,
4310                              "analyze in outer loop: %T\n", init_ref);
4311
4312           opt_result res
4313             = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4314                                     init_ref, loop, stmt_info->stmt);
4315           if (!res)
4316             /* dr_analyze_innermost already explained the failure.  */
4317             return res;
4318
4319           if (dump_enabled_p ())
4320             dump_printf_loc (MSG_NOTE, vect_location,
4321                              "\touter base_address: %T\n"
4322                              "\touter offset from base address: %T\n"
4323                              "\touter constant offset from base address: %T\n"
4324                              "\touter step: %T\n"
4325                              "\touter base alignment: %d\n\n"
4326                              "\touter base misalignment: %d\n"
4327                              "\touter offset alignment: %d\n"
4328                              "\touter step alignment: %d\n",
4329                              STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4330                              STMT_VINFO_DR_OFFSET (stmt_info),
4331                              STMT_VINFO_DR_INIT (stmt_info),
4332                              STMT_VINFO_DR_STEP (stmt_info),
4333                              STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4334                              STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4335                              STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4336                              STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4337         }
4338
4339       /* Set vectype for STMT.  */
4340       scalar_type = TREE_TYPE (DR_REF (dr));
4341       STMT_VINFO_VECTYPE (stmt_info)
4342         = get_vectype_for_scalar_type (scalar_type);
4343       if (!STMT_VINFO_VECTYPE (stmt_info))
4344         {
4345           if (dump_enabled_p ())
4346             {
4347               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4348                                "not vectorized: no vectype for stmt: %G",
4349                                stmt_info->stmt);
4350               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4351               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4352                                  scalar_type);
4353               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4354             }
4355
4356           if (is_a <bb_vec_info> (vinfo))
4357             {
4358               /* No vector type is fine, the ref can still participate
4359                  in dependence analysis, we just can't vectorize it.  */
4360               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4361               continue;
4362             }
4363           return opt_result::failure_at (stmt_info->stmt,
4364                                          "not vectorized:"
4365                                          " no vectype for stmt: %G"
4366                                          " scalar_type: %T\n",
4367                                          stmt_info->stmt, scalar_type);
4368         }
4369       else
4370         {
4371           if (dump_enabled_p ())
4372             dump_printf_loc (MSG_NOTE, vect_location,
4373                              "got vectype for stmt: %G%T\n",
4374                              stmt_info->stmt, STMT_VINFO_VECTYPE (stmt_info));
4375         }
4376
4377       /* Adjust the minimal vectorization factor according to the
4378          vector type.  */
4379       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4380       *min_vf = upper_bound (*min_vf, vf);
4381
4382       if (gatherscatter != SG_NONE)
4383         {
4384           gather_scatter_info gs_info;
4385           if (!vect_check_gather_scatter (stmt_info,
4386                                           as_a <loop_vec_info> (vinfo),
4387                                           &gs_info)
4388               || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4389             {
4390               if (fatal)
4391                 *fatal = false;
4392               return opt_result::failure_at
4393                         (stmt_info->stmt,
4394                          (gatherscatter == GATHER)
4395                          ? "not vectorized: not suitable for gather load %G"
4396                          : "not vectorized: not suitable for scatter store %G",
4397                          stmt_info->stmt);
4398             }
4399           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4400         }
4401     }
4402
4403   /* We used to stop processing and prune the list here.  Verify we no
4404      longer need to.  */
4405   gcc_assert (i == datarefs.length ());
4406
4407   return opt_result::success ();
4408 }
4409
4410
4411 /* Function vect_get_new_vect_var.
4412
4413    Returns a name for a new variable.  The current naming scheme appends the
4414    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4415    the name of vectorizer generated variables, and appends that to NAME if
4416    provided.  */
4417
4418 tree
4419 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4420 {
4421   const char *prefix;
4422   tree new_vect_var;
4423
4424   switch (var_kind)
4425   {
4426   case vect_simple_var:
4427     prefix = "vect";
4428     break;
4429   case vect_scalar_var:
4430     prefix = "stmp";
4431     break;
4432   case vect_mask_var:
4433     prefix = "mask";
4434     break;
4435   case vect_pointer_var:
4436     prefix = "vectp";
4437     break;
4438   default:
4439     gcc_unreachable ();
4440   }
4441
4442   if (name)
4443     {
4444       char* tmp = concat (prefix, "_", name, NULL);
4445       new_vect_var = create_tmp_reg (type, tmp);
4446       free (tmp);
4447     }
4448   else
4449     new_vect_var = create_tmp_reg (type, prefix);
4450
4451   return new_vect_var;
4452 }
4453
4454 /* Like vect_get_new_vect_var but return an SSA name.  */
4455
4456 tree
4457 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4458 {
4459   const char *prefix;
4460   tree new_vect_var;
4461
4462   switch (var_kind)
4463   {
4464   case vect_simple_var:
4465     prefix = "vect";
4466     break;
4467   case vect_scalar_var:
4468     prefix = "stmp";
4469     break;
4470   case vect_pointer_var:
4471     prefix = "vectp";
4472     break;
4473   default:
4474     gcc_unreachable ();
4475   }
4476
4477   if (name)
4478     {
4479       char* tmp = concat (prefix, "_", name, NULL);
4480       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4481       free (tmp);
4482     }
4483   else
4484     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4485
4486   return new_vect_var;
4487 }
4488
4489 /* Duplicate ptr info and set alignment/misaligment on NAME from DR_INFO.  */
4490
4491 static void
4492 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4493 {
4494   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4495   int misalign = DR_MISALIGNMENT (dr_info);
4496   if (misalign == DR_MISALIGNMENT_UNKNOWN)
4497     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4498   else
4499     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4500                             known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4501                             misalign);
4502 }
4503
4504 /* Function vect_create_addr_base_for_vector_ref.
4505
4506    Create an expression that computes the address of the first memory location
4507    that will be accessed for a data reference.
4508
4509    Input:
4510    STMT_INFO: The statement containing the data reference.
4511    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4512    OFFSET: Optional. If supplied, it is added to the initial address.
4513    LOOP:    Specify relative to which loop-nest should the address be computed.
4514             For example, when the dataref is in an inner-loop nested in an
4515             outer-loop that is now being vectorized, LOOP can be either the
4516             outer-loop, or the inner-loop.  The first memory location accessed
4517             by the following dataref ('in' points to short):
4518
4519                 for (i=0; i<N; i++)
4520                    for (j=0; j<M; j++)
4521                      s += in[i+j]
4522
4523             is as follows:
4524             if LOOP=i_loop:     &in             (relative to i_loop)
4525             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
4526    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4527             initial address.  Unlike OFFSET, which is number of elements to
4528             be added, BYTE_OFFSET is measured in bytes.
4529
4530    Output:
4531    1. Return an SSA_NAME whose value is the address of the memory location of
4532       the first vector of the data reference.
4533    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4534       these statement(s) which define the returned SSA_NAME.
4535
4536    FORNOW: We are only handling array accesses with step 1.  */
4537
4538 tree
4539 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4540                                       gimple_seq *new_stmt_list,
4541                                       tree offset,
4542                                       tree byte_offset)
4543 {
       /* Build the initial address of STMT_INFO's data reference, gimplify
          it into a pointer-to-vectype value, and append the statements that
          compute it to *NEW_STMT_LIST.  */
4544   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4545   struct data_reference *dr = dr_info->dr;
4546   const char *base_name;
4547   tree addr_base;
4548   tree dest;
4549   gimple_seq seq = NULL;
4550   tree vect_ptr_type;
       /* Size in bytes of the type of the data reference; used below to
          scale OFFSET, which is counted in elements.  */
4551   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4552   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4553   innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4554
       /* Work on private copies so that folding below cannot modify the
          trees recorded in the data reference's behavior.  */
4555   tree data_ref_base = unshare_expr (drb->base_address);
4556   tree base_offset = unshare_expr (drb->offset);
4557   tree init = unshare_expr (drb->init);
4558
       /* With a loop_vec_info the pointer is named after the base address.
          Without one the address is taken directly from the reference
          further down, so the recorded offset and init are replaced by
          zero and the name comes from the reference itself.  */
4559   if (loop_vinfo)
4560     base_name = get_name (data_ref_base);
4561   else
4562     {
4563       base_offset = ssize_int (0);
4564       init = ssize_int (0);
4565       base_name = get_name (DR_REF (dr));
4566     }
4567
4568   /* Create base_offset: the sum of the offset and init, as sizetype.  */
4569   base_offset = size_binop (PLUS_EXPR,
4570                             fold_convert (sizetype, base_offset),
4571                             fold_convert (sizetype, init));
4572
       /* OFFSET is counted in elements: scale it by the element size before
          adding it.  */
4573   if (offset)
4574     {
4575       offset = fold_build2 (MULT_EXPR, sizetype,
4576                             fold_convert (sizetype, offset), step);
4577       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4578                                  base_offset, offset);
4579     }
       /* BYTE_OFFSET is already in bytes and is added unscaled.  */
4580   if (byte_offset)
4581     {
4582       byte_offset = fold_convert (sizetype, byte_offset);
4583       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4584                                  base_offset, byte_offset);
4585     }
4586
4587   /* base + base_offset */
4588   if (loop_vinfo)
4589     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4590   else
4591     {
           /* Without a loop_vec_info the address is simply that of the
              reference.  NOTE(review): base_offset, including any OFFSET or
              BYTE_OFFSET folded in above, is not used on this path.  */
4592       addr_base = build1 (ADDR_EXPR,
4593                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
4594                           unshare_expr (DR_REF (dr)));
4595     }
4596
       /* Gimplify the address into a new pointer-to-vectype variable and
          hand the statements that compute it back to the caller.  */
4597   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4598   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4599   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4600   gimple_seq_add_seq (new_stmt_list, seq);
4601
       /* If the data reference has points-to info and the result is an SSA
          name that has none yet, copy it over.  Alignment is marked unknown
          whenever an extra offset was supplied.  */
4602   if (DR_PTR_INFO (dr)
4603       && TREE_CODE (addr_base) == SSA_NAME
4604       && !SSA_NAME_PTR_INFO (addr_base))
4605     {
4606       vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4607       if (offset || byte_offset)
4608         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4609     }
4610
4611   if (dump_enabled_p ())
4612     dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4613
4614   return addr_base;
4615 }
4616
4617
4618 /* Function vect_create_data_ref_ptr.
4619
4620    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4621    location accessed in the loop by STMT_INFO, along with the def-use update
4622    chain to appropriately advance the pointer through the loop iterations.
4623    Also set aliasing information for the pointer.  This pointer is used by
4624    the callers to this function to create a memory reference expression for
4625    vector load/store access.
4626
4627    Input:
4628    1. STMT_INFO: a stmt that references memory. Expected to be of the form
4629          GIMPLE_ASSIGN <name, data-ref> or
4630          GIMPLE_ASSIGN <data-ref, name>.
4631    2. AGGR_TYPE: the type of the reference, which should be either a vector
4632         or an array.
             (This is asserted only when IV_STEP is not supplied.)
4633    3. AT_LOOP: the loop where the vector memref is to be created.
4634    4. OFFSET (optional): an offset to be added to the initial address accessed
4635         by the data-ref in STMT_INFO.
4636    5. GSI: location where the new stmts are to be placed if there is no loop
4637    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4638         pointing to the initial address.
4639    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4640         to the initial address accessed by the data-ref in STMT_INFO.  This is
4641         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4642         in bytes.
4643    8. IV_STEP (optional, defaults to NULL): the amount that should be added
4644         to the IV during each iteration of the loop.  NULL says to move
4645         by one copy of AGGR_TYPE up or down, depending on the step of the
4646         data reference.
4647
4648    Output:
4649    1. Declare a new ptr to vector_type, and have it point to the base of the
4650       data reference (initial address accessed by the data reference).
4651       For example, for vector of type V8HI, the following code is generated:
4652
4653       v8hi *ap;
4654       ap = (v8hi *)initial_address;
4655
4656       if OFFSET is not supplied:
4657          initial_address = &a[init];
4658       if OFFSET is supplied:
4659          initial_address = &a[init + OFFSET];
4660       if BYTE_OFFSET is supplied:
4661          initial_address = &a[init] + BYTE_OFFSET;
4662
4663       Return the initial_address in INITIAL_ADDRESS.
4664
4665    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4666       update the pointer in each iteration of the loop.
4667
4668       Return the increment stmt that updates the pointer in PTR_INCR.
4669
4670    3. Return the pointer.  */
4671
4672 tree
4673 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4674                           struct loop *at_loop, tree offset,
4675                           tree *initial_address, gimple_stmt_iterator *gsi,
4676                           gimple **ptr_incr, bool only_init,
4677                           tree byte_offset, tree iv_step)
4678 {
4679   const char *base_name;
4680   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4681   struct loop *loop = NULL;
4682   bool nested_in_vect_loop = false;
4683   struct loop *containing_loop = NULL;
4684   tree aggr_ptr_type;
4685   tree aggr_ptr;
4686   tree new_temp;
4687   gimple_seq new_stmt_list = NULL;
4688   edge pe = NULL;
4689   basic_block new_bb;
4690   tree aggr_ptr_init;
4691   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4692   struct data_reference *dr = dr_info->dr;
4693   tree aptr;
4694   gimple_stmt_iterator incr_gsi;
4695   bool insert_after;
4696   tree indx_before_incr, indx_after_incr;
4697   gimple *incr;
4698   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4699
       /* Without an explicit IV_STEP the step is derived from the size of
          AGGR_TYPE, which must then be an array or vector type.  */
4700   gcc_assert (iv_step != NULL_TREE
4701               || TREE_CODE (aggr_type) == ARRAY_TYPE
4702               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4703
4704   if (loop_vinfo)
4705     {
           /* LOOP is the loop being vectorized, CONTAINING_LOOP the loop
              that holds the statement, and PE the preheader edge of LOOP
              on which the initial address computation is inserted.  */
4706       loop = LOOP_VINFO_LOOP (loop_vinfo);
4707       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4708       containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4709       pe = loop_preheader_edge (loop);
4710     }
4711   else
4712     {
           /* Basic-block vectorization: there is no loop in which to update
              the pointer, so only the initial address is produced.
              NOTE(review): PTR_INCR is dereferenced unconditionally here,
              whereas the loop paths below check it for NULL first.  */
4713       gcc_assert (bb_vinfo);
4714       only_init = true;
4715       *ptr_incr = NULL;
4716     }
4717
4718   /* Create an expression for the first address accessed by this load
4719      in LOOP.  */
4720   base_name = get_name (DR_BASE_ADDRESS (dr));
4721
       /* Describe in the dump which kind of object the reference is based
          on.  */
4722   if (dump_enabled_p ())
4723     {
4724       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4725       dump_printf_loc (MSG_NOTE, vect_location,
4726                        "create %s-pointer variable to type: %T",
4727                        get_tree_code_name (TREE_CODE (aggr_type)),
4728                        aggr_type);
4729       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4730         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4731       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4732         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4733       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4734         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4735       else
4736         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4737       dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4738     }
4739
4740   /* (1) Create the new aggregate-pointer variable.
4741      Vector and array types inherit the alias set of their component
4742      type by default so we need to use a ref-all pointer if the data
4743      reference does not conflict with the created aggregated data
4744      reference because it is not addressable.  */
4745   bool need_ref_all = false;
4746   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4747                               get_alias_set (DR_REF (dr))))
4748     need_ref_all = true;
4749   /* Likewise for any of the data references in the stmt group.  */
4750   else if (DR_GROUP_SIZE (stmt_info) > 1)
4751     {
           /* Walk the whole group, starting at its first element; a single
              non-conflicting member is enough to require ref-all.  */
4752       stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4753       do
4754         {
4755           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4756           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4757                                       get_alias_set (DR_REF (sdr))))
4758             {
4759               need_ref_all = true;
4760               break;
4761             }
4762           sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4763         }
4764       while (sinfo);
4765     }
4766   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4767                                                need_ref_all);
4768   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4769
4770
4771   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4772      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4773      def-use update cycles for the pointer: one relative to the outer-loop
4774      (LOOP), which is what steps (2) and (3) below do.  The other is relative
4775      to the inner-loop (which is the inner-most loop containing the dataref),
4776      and this is done by step (4) below.
4777
4778      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4779      inner-most loop, and so steps (2),(3) work the same, and step (4) is
4780      redundant.  Steps (2),(3) create the following:
4781
4782         vp0 = &base_addr;
4783         LOOP:   vp1 = phi(vp0,vp2)
4784                 ...
4785                 ...
4786                 vp2 = vp1 + step
4787                 goto LOOP
4788
4789      If there is an inner-loop nested in loop, then step (4) will also be
4790      applied, and an additional update in the inner-loop will be created:
4791
4792         vp0 = &base_addr;
4793         LOOP:   vp1 = phi(vp0,vp2)
4794                 ...
4795         inner:     vp3 = phi(vp1,vp4)
4796                    vp4 = vp3 + inner_step
4797                    if () goto inner
4798                 ...
4799                 vp2 = vp1 + step
4800                 if () goto LOOP   */
4801
4802   /* (2) Calculate the initial address of the aggregate-pointer, and set
4803      the aggregate-pointer to point to it before the loop.  */
4804
4805   /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader.  */
4806
4807   new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4808                                                    offset, byte_offset);
       /* With a loop, insert the computation on the preheader edge; the
          assert requires that this did not create a new basic block.
          Without a loop, insert it before GSI.  */
4809   if (new_stmt_list)
4810     {
4811       if (pe)
4812         {
4813           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4814           gcc_assert (!new_bb);
4815         }
4816       else
4817         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4818     }
4819
4820   *initial_address = new_temp;
4821   aggr_ptr_init = new_temp;
4822
4823   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4824      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4825      inner-loop nested in LOOP (during outer-loop vectorization).  */
4826
4827   /* No update in loop is required.  */
4828   if (only_init && (!loop_vinfo || at_loop == loop))
4829     aptr = aggr_ptr_init;
4830   else
4831     {
           /* This branch implies a non-null loop_vinfo: without one,
              ONLY_INIT was forced to true above and the condition holds.  */
4832       /* Accesses to invariant addresses should be handled specially
4833          by the caller.  */
4834       tree step = vect_dr_behavior (dr_info)->step;
4835       gcc_assert (!integer_zerop (step));
4836
4837       if (iv_step == NULL_TREE)
4838         {
4839           /* The step of the aggregate pointer is the type size,
4840              negated for downward accesses.  */
4841           iv_step = TYPE_SIZE_UNIT (aggr_type);
4842           if (tree_int_cst_sgn (step) == -1)
4843             iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4844         }
4845
4846       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4847
           /* Create the pointer IV in LOOP: it starts at the initial
              address and is advanced by IV_STEP.  */
4848       create_iv (aggr_ptr_init,
4849                  fold_convert (aggr_ptr_type, iv_step),
4850                  aggr_ptr, loop, &incr_gsi, insert_after,
4851                  &indx_before_incr, &indx_after_incr);
           /* Register the new increment statement with the loop's
              vectorization info.  */
4852       incr = gsi_stmt (incr_gsi);
4853       loop_vinfo->add_stmt (incr);
4854
4855       /* Copy the points-to information if it exists. */
4856       if (DR_PTR_INFO (dr))
4857         {
4858           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4859           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4860         }
4861       if (ptr_incr)
4862         *ptr_incr = incr;
4863
4864       aptr = indx_before_incr;
4865     }
4866
4867   if (!nested_in_vect_loop || only_init)
4868     return aptr;
4869
4870
4871   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4872      nested in LOOP, if exists.  */
4873
       /* The early return above already guarantees NESTED_IN_VECT_LOOP and
          !ONLY_INIT here; the assert and the unreachable else branch merely
          restate that.  */
4874   gcc_assert (nested_in_vect_loop);
4875   if (!only_init)
4876     {
4877       standard_iv_increment_position (containing_loop, &incr_gsi,
4878                                       &insert_after);
           /* Second IV, in the loop containing the statement: it starts
              from the outer pointer APTR and advances by DR_STEP.  */
4879       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4880                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4881                  &indx_after_incr);
4882       incr = gsi_stmt (incr_gsi);
4883       loop_vinfo->add_stmt (incr);
4884
4885       /* Copy the points-to information if it exists. */
4886       if (DR_PTR_INFO (dr))
4887         {
4888           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4889           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4890         }
           /* PTR_INCR now reports the inner-loop increment, overwriting the
              outer-loop one stored in step (3).  */
4891       if (ptr_incr)
4892         *ptr_incr = incr;
4893
4894       return indx_before_incr;
4895     }
4896   else
4897     gcc_unreachable ();
4898 }
4899
4900
4901 /* Function bump_vector_ptr
4902
4903    Increment a pointer (to a vector type) by vector-size. If requested,
4904    i.e. if PTR-INCR is given, then also connect the new increment stmt
4905    to the existing def-use update-chain of the pointer, by modifying
4906    the PTR_INCR as illustrated below:
4907
4908    The pointer def-use update-chain before this function:
4909                         DATAREF_PTR = phi (p_0, p_2)
4910                         ....
4911         PTR_INCR:       p_2 = DATAREF_PTR + step
4912
4913    The pointer def-use update-chain after this function:
4914                         DATAREF_PTR = phi (p_0, p_2)
4915                         ....
4916                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4917                         ....
4918         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4919
4920    Input:
4921    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4922                  in the loop.
4923    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4924               the loop.  The increment amount across iterations is expected
4925               to be vector_size.
4926    GSI - location where the new update stmt is to be placed.
4927    STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4928    BUMP - optional. The offset by which to bump the pointer. If not given,
4929           the offset is assumed to be vector_size.
4930
4931    Output: Return NEW_DATAREF_PTR as illustrated above.
4932
4933 */
4934
4935 tree
4936 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4937                  stmt_vec_info stmt_info, tree bump)
4938 {
4939   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4940   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
       /* Default bump amount: the size of the vector type in bytes.  */
4941   tree update = TYPE_SIZE_UNIT (vectype);
4942   gassign *incr_stmt;
4943   ssa_op_iter iter;
4944   use_operand_p use_p;
4945   tree new_dataref_ptr;
4946
4947   if (bump)
4948     update = bump;
4949
       /* Name the result after DATAREF_PTR when that is an SSA name;
          otherwise create a fresh SSA name of the same type.  */
4950   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4951     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4952   else
4953     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
       /* NEW_DATAREF_PTR = DATAREF_PTR + UPDATE, emitted at GSI.  */
4954   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4955                                    dataref_ptr, update);
4956   vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4957
4958   /* Copy the points-to information if it exists. */
4959   if (DR_PTR_INFO (dr))
4960     {
4961       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
           /* The bumped pointer's alignment is not tracked.  */
4962       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4963     }
4964
4965   if (!ptr_incr)
4966     return new_dataref_ptr;
4967
4968   /* Update the vector-pointer's cross-iteration increment.  */
       /* Redirect PTR_INCR's use of DATAREF_PTR to the bumped pointer; any
          other SSA use operand of PTR_INCR is asserted to equal UPDATE.  */
4969   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4970     {
4971       tree use = USE_FROM_PTR (use_p);
4972
4973       if (use == dataref_ptr)
4974         SET_USE (use_p, new_dataref_ptr);
4975       else
4976         gcc_assert (operand_equal_p (use, update, 0));
4977     }
4978
4979   return new_dataref_ptr;
4980 }
4981
4982
4983 /* Copy memory reference info such as base/clique from the SRC reference
4984    to the DEST MEM_REF.  Does nothing if DEST is not a MEM_REF, or if the
        innermost base of SRC is neither a MEM_REF nor a TARGET_MEM_REF.  */
4985
4986 void
4987 vect_copy_ref_info (tree dest, tree src)
4988 {
4989   if (TREE_CODE (dest) != MEM_REF)
4990     return;
4991
       /* Strip handled components from SRC to reach its innermost base.  */
4992   tree src_base = src;
4993   while (handled_component_p (src_base))
4994     src_base = TREE_OPERAND (src_base, 0);
       /* Only MEM_REF and TARGET_MEM_REF bases are read for dependence
          info.  */
4995   if (TREE_CODE (src_base) != MEM_REF
4996       && TREE_CODE (src_base) != TARGET_MEM_REF)
4997     return;
4998
4999   MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5000   MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5001 }
5002
5003
5004 /* Function vect_create_destination_var.
5005
5006    Create a new temporary of type VECTYPE, or of the type of SCALAR_DEST
        when VECTYPE is NULL.  SCALAR_DEST must be an SSA_NAME; the name of
        the new variable is built from its name (if it has one) and its SSA
        version number.  */
5007
5008 tree
5009 vect_create_destination_var (tree scalar_dest, tree vectype)
5010 {
5011   tree vec_dest;
5012   const char *name;
5013   char *new_name;
5014   tree type;
5015   enum vect_var_kind kind;
5016
       /* Variable kind: mask for boolean vector types, simple for any other
          vector type, scalar when no VECTYPE is given.  */
5017   kind = vectype
5018     ? VECTOR_BOOLEAN_TYPE_P (vectype)
5019     ? vect_mask_var
5020     : vect_simple_var
5021     : vect_scalar_var;
5022   type = vectype ? vectype : TREE_TYPE (scalar_dest);
5023
5024   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5025
       /* Use "<name>_<version>" when SCALAR_DEST has a name, otherwise
          "_<version>".  */
5026   name = get_name (scalar_dest);
5027   if (name)
5028     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5029   else
5030     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5031   vec_dest = vect_get_new_vect_var (type, kind, new_name);
       /* xasprintf allocated the buffer; release it once the variable has
          been created.  */
5032   free (new_name);
5033
5034   return vec_dest;
5035 }
5036
5037 /* Function vect_grouped_store_supported.
5038
5039    Returns TRUE if interleave high and interleave low permutations
5040    are supported, and FALSE otherwise.  COUNT is the size of the group of
        accesses and must be 3 or a power of two; VECTYPE supplies the
        vector mode whose permutations are checked.  */
5041
5042 bool
5043 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5044 {
5045   machine_mode mode = TYPE_MODE (vectype);
5046
5047   /* vect_permute_store_chain requires the group size to be equal to 3 or
5048      be a power of two.  */
5049   if (count != 3 && exact_log2 (count) == -1)
5050     {
           /* NOTE(review): "eqaul" below is a typo in the dump message; it
              is a runtime string and is left untouched here.  */
5051       if (dump_enabled_p ())
5052         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5053                          "the size of the group of accesses"
5054                          " is not a power of 2 or not eqaul to 3\n");
5055       return false;
5056     }
5057
5058   /* Check that the permutation is supported.  */
5059   if (VECTOR_MODE_P (mode))
5060     {
5061       unsigned int i;
5062       if (count == 3)
5063         {
5064           unsigned int j0 = 0, j1 = 0, j2 = 0;
               /* NOTE(review): this redeclares I, shadowing the declaration
                  in the enclosing block.  */
5065           unsigned int i, j;
5066
               /* The selectors below are built element by element, which
                  needs a compile-time constant number of elements.  */
5067           unsigned int nelt;
5068           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5069             {
5070               if (dump_enabled_p ())
5071                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5072                                  "cannot handle groups of 3 stores for"
5073                                  " variable-length vectors\n");
5074               return false;
5075             }
5076
5077           vec_perm_builder sel (nelt, nelt, 1);
5078           sel.quick_grow (nelt);
5079           vec_perm_indices indices;
               /* Check the same two masks per output vector J that
                  vect_permute_store_chain builds for a group of 3.  */
5080           for (j = 0; j < 3; j++)
5081             {
                   /* Residues modulo 3 that pick which result slots are
                      filled from the first, second and third input.  */
5082               int nelt0 = ((3 - j) * nelt) % 3;
5083               int nelt1 = ((3 - j) * nelt + 1) % 3;
5084               int nelt2 = ((3 - j) * nelt + 2) % 3;
                   /* First mask: consecutive elements of the first operand
                      (J0) and of the second operand (NELT + J1); the third
                      slot is a placeholder index 0.  */
5085               for (i = 0; i < nelt; i++)
5086                 {
5087                   if (3 * i + nelt0 < nelt)
5088                     sel[3 * i + nelt0] = j0++;
5089                   if (3 * i + nelt1 < nelt)
5090                     sel[3 * i + nelt1] = nelt + j1++;
5091                   if (3 * i + nelt2 < nelt)
5092                     sel[3 * i + nelt2] = 0;
5093                 }
5094               indices.new_vector (sel, 2, nelt);
5095               if (!can_vec_perm_const_p (mode, indices))
5096                 {
5097                   if (dump_enabled_p ())
5098                     dump_printf (MSG_MISSED_OPTIMIZATION,
5099                                  "permutation op not supported by target.\n");
5100                   return false;
5101                 }
5102
                   /* Second mask: keep the two slots already filled
                      (identity indices into the first operand) and fill the
                      remaining slot from the second operand (NELT + J2).  */
5103               for (i = 0; i < nelt; i++)
5104                 {
5105                   if (3 * i + nelt0 < nelt)
5106                     sel[3 * i + nelt0] = 3 * i + nelt0;
5107                   if (3 * i + nelt1 < nelt)
5108                     sel[3 * i + nelt1] = 3 * i + nelt1;
5109                   if (3 * i + nelt2 < nelt)
5110                     sel[3 * i + nelt2] = nelt + j2++;
5111                 }
5112               indices.new_vector (sel, 2, nelt);
5113               if (!can_vec_perm_const_p (mode, indices))
5114                 {
5115                   if (dump_enabled_p ())
5116                     dump_printf (MSG_MISSED_OPTIMIZATION,
5117                                  "permutation op not supported by target.\n");
5118                   return false;
5119                 }
5120             }
5121           return true;
5122         }
5123       else
5124         {
5125           /* If length is not equal to 3 then only power of 2 is supported.  */
5126           gcc_assert (pow2p_hwi (count));
5127           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5128
5129           /* The encoding has 2 interleaved stepped patterns.  */
5130           vec_perm_builder sel (nelt, 2, 3);
5131           sel.quick_grow (6);
               /* First selector: { 0, nelt, 1, nelt + 1, 2, nelt + 2 }.  */
5132           for (i = 0; i < 3; i++)
5133             {
5134               sel[i * 2] = i;
5135               sel[i * 2 + 1] = i + nelt;
5136             }
5137           vec_perm_indices indices (sel, 2, nelt);
5138           if (can_vec_perm_const_p (mode, indices))
5139             {
                   /* Second selector: the same pattern shifted by half a
                      vector.  Both must be supported.  */
5140               for (i = 0; i < 6; i++)
5141                 sel[i] += exact_div (nelt, 2);
5142               indices.new_vector (sel, 2, nelt);
5143               if (can_vec_perm_const_p (mode, indices))
5144                 return true;
5145             }
5146         }
5147     }
5148
       /* Reached for a non-vector mode or when a power-of-two interleave
          permutation is not supported.  */
5149   if (dump_enabled_p ())
5150     dump_printf (MSG_MISSED_OPTIMIZATION,
5151                  "permutation op not supported by target.\n");
5152   return false;
5153 }
5154
5155
5156 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5157    type VECTYPE.  MASKED_P says whether the masked form is needed.  The
        check itself is delegated to vect_lanes_optab_supported_p, which is
        given the optab and its name.  */
5158
5159 bool
5160 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5161                             bool masked_p)
5162 {
5163   if (masked_p)
5164     return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5165                                          vec_mask_store_lanes_optab,
5166                                          vectype, count);
5167   else
5168     return vect_lanes_optab_supported_p ("vec_store_lanes",
5169                                          vec_store_lanes_optab,
5170                                          vectype, count);
5171 }
5172
5173
5174 /* Function vect_permute_store_chain.
5175
5176    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5177    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5178    the data correctly for the stores.  Return the final references for stores
5179    in RESULT_CHAIN.
5180
5181    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5182    The input is 4 vectors each containing 8 elements.  We assign a number to
5183    each element, the input sequence is:
5184
5185    1st vec:   0  1  2  3  4  5  6  7
5186    2nd vec:   8  9 10 11 12 13 14 15
5187    3rd vec:  16 17 18 19 20 21 22 23
5188    4th vec:  24 25 26 27 28 29 30 31
5189
5190    The output sequence should be:
5191
5192    1st vec:  0  8 16 24  1  9 17 25
5193    2nd vec:  2 10 18 26  3 11 19 27
5194    3rd vec:  4 12 20 28  5 13 21 29
5195    4th vec:  6 14 22 30  7 15 23 31
5196
5197    i.e., we interleave the contents of the four vectors in their order.
5198
5199    We use interleave_high/low instructions to create such output.  The input of
5200    each interleave_high/low operation is two vectors:
5201    1st vec    2nd vec
5202    0 1 2 3    4 5 6 7
5203    the even elements of the result vector are obtained left-to-right from the
5204    high/low elements of the first vector.  The odd elements of the result are
5205    obtained left-to-right from the high/low elements of the second vector.
5206    The output of interleave_high will be:   0 4 1 5
5207    and of interleave_low:                   2 6 3 7
5208
5209
5210    The permutation is done in log LENGTH stages.  In each stage interleave_high
5211    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5212    where the first argument is taken from the first half of DR_CHAIN and the
5213    second argument from its second half.
5214    In our example,
5215
5216    I1: interleave_high (1st vec, 3rd vec)
5217    I2: interleave_low (1st vec, 3rd vec)
5218    I3: interleave_high (2nd vec, 4th vec)
5219    I4: interleave_low (2nd vec, 4th vec)
5220
5221    The output for the first stage is:
5222
5223    I1:  0 16  1 17  2 18  3 19
5224    I2:  4 20  5 21  6 22  7 23
5225    I3:  8 24  9 25 10 26 11 27
5226    I4: 12 28 13 29 14 30 15 31
5227
5228    The output of the second stage, i.e. the final result is:
5229
5230    I1:  0  8 16 24  1  9 17 25
5231    I2:  2 10 18 26  3 11 19 27
5232    I3:  4 12 20 28  5 13 21 29
5233    I4:  6 14 22 30  7 15 23 31.  */
5234
5235 void
5236 vect_permute_store_chain (vec<tree> dr_chain,
5237                           unsigned int length,
5238                           stmt_vec_info stmt_info,
5239                           gimple_stmt_iterator *gsi,
5240                           vec<tree> *result_chain)
5241 {
5242   tree vect1, vect2, high, low;
5243   gimple *perm_stmt;
5244   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5245   tree perm_mask_low, perm_mask_high;
5246   tree data_ref;
5247   tree perm3_mask_low, perm3_mask_high;
       /* LOG_LENGTH is only used on the power-of-two path below.  */
5248   unsigned int i, j, n, log_length = exact_log2 (length);
5249
       /* Start RESULT_CHAIN as a copy of the LENGTH input vectors; the
          code below overwrites its entries with the permuted results.  */
5250   result_chain->quick_grow (length);
5251   memcpy (result_chain->address (), dr_chain.address (),
5252           length * sizeof (tree));
5253
5254   if (length == 3)
5255     {
5256       /* vect_grouped_store_supported ensures that this is constant.  */
5257       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5258       unsigned int j0 = 0, j1 = 0, j2 = 0;
5259
5260       vec_perm_builder sel (nelt, nelt, 1);
5261       sel.quick_grow (nelt);
5262       vec_perm_indices indices;
           /* Each iteration produces output vector J with two permutes:
              the first merges dr_chain[0] and dr_chain[1], the second
              merges that result with dr_chain[2].  */
5263       for (j = 0; j < 3; j++)
5264         {
               /* Residues modulo 3 that pick which result slots are filled
                  from the first, second and third input.  */
5265           int nelt0 = ((3 - j) * nelt) % 3;
5266           int nelt1 = ((3 - j) * nelt + 1) % 3;
5267           int nelt2 = ((3 - j) * nelt + 2) % 3;
5268
               /* Low mask: consecutive elements of the first operand (J0)
                  and of the second operand (NELT + J1); the third slot is
                  a placeholder index 0.  */
5269           for (i = 0; i < nelt; i++)
5270             {
5271               if (3 * i + nelt0 < nelt)
5272                 sel[3 * i + nelt0] = j0++;
5273               if (3 * i + nelt1 < nelt)
5274                 sel[3 * i + nelt1] = nelt + j1++;
5275               if (3 * i + nelt2 < nelt)
5276                 sel[3 * i + nelt2] = 0;
5277             }
5278           indices.new_vector (sel, 2, nelt);
5279           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5280
               /* High mask: keep the two slots already filled (identity
                  indices into the first operand) and take the remaining
                  slot from the second operand (NELT + J2).  */
5281           for (i = 0; i < nelt; i++)
5282             {
5283               if (3 * i + nelt0 < nelt)
5284                 sel[3 * i + nelt0] = 3 * i + nelt0;
5285               if (3 * i + nelt1 < nelt)
5286                 sel[3 * i + nelt1] = 3 * i + nelt1;
5287               if (3 * i + nelt2 < nelt)
5288                 sel[3 * i + nelt2] = nelt + j2++;
5289             }
5290           indices.new_vector (sel, 2, nelt);
5291           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5292
5293           vect1 = dr_chain[0];
5294           vect2 = dr_chain[1];
5295
5296           /* Create interleaving stmt:
5297              low = VEC_PERM_EXPR <vect1, vect2,
5298                                   {j, nelt, *, j + 1, nelt + j + 1, *,
5299                                    j + 2, nelt + j + 2, *, ...}>  */
5300           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5301           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5302                                            vect2, perm3_mask_low);
5303           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5304
5305           vect1 = data_ref;
5306           vect2 = dr_chain[2];
5307           /* Create interleaving stmt:
5308              high = VEC_PERM_EXPR <vect1, vect2,
5309                                    {0, 1, nelt + j, 3, 4, nelt + j + 1,
5310                                     6, 7, nelt + j + 2, ...}>  */
5311           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5312           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5313                                            vect2, perm3_mask_high);
5314           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5315           (*result_chain)[j] = data_ref;
5316         }
5317     }
5318   else
5319     {
5320       /* If length is not equal to 3 then only power of 2 is supported.  */
5321       gcc_assert (pow2p_hwi (length));
5322
5323       /* The encoding has 2 interleaved stepped patterns.  */
5324       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5325       vec_perm_builder sel (nelt, 2, 3);
5326       sel.quick_grow (6);
           /* High selector: { 0, nelt, 1, nelt + 1, 2, nelt + 2 }.  */
5327       for (i = 0; i < 3; i++)
5328         {
5329           sel[i * 2] = i;
5330           sel[i * 2 + 1] = i + nelt;
5331         }
5332         vec_perm_indices indices (sel, 2, nelt);
5333         perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5334
             /* Low selector: the same pattern shifted by half a vector.  */
5335         for (i = 0; i < 6; i++)
5336           sel[i] += exact_div (nelt, 2);
5337         indices.new_vector (sel, 2, nelt);
5338         perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5339
             /* One pass per stage: pair vector J from the first half of
                DR_CHAIN with vector J from the second half and emit their
                high and low interleaves as results 2*J and 2*J+1.  */
5340         for (i = 0, n = log_length; i < n; i++)
5341           {
5342             for (j = 0; j < length/2; j++)
5343               {
5344                 vect1 = dr_chain[j];
5345                 vect2 = dr_chain[j+length/2];
5346
5347                 /* Create interleaving stmt:
5348                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5349                                                         ...}>  */
5350                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5351                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5352                                                  vect2, perm_mask_high);
5353                 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5354                 (*result_chain)[2*j] = high;
5355
5356                 /* Create interleaving stmt:
5357                    low = VEC_PERM_EXPR <vect1, vect2,
5358                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5359                                          ...}>  */
5360                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5361                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5362                                                  vect2, perm_mask_low);
5363                 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5364                 (*result_chain)[2*j+1] = low;
5365               }
                 /* Feed this stage's results back as the next stage's
                    input.  NOTE(review): DR_CHAIN is passed by value but
                    this writes through its address (); presumably the
                    storage is shared with the caller's vector -- confirm.  */
5366             memcpy (dr_chain.address (), result_chain->address (),
5367                     length * sizeof (tree));
5368           }
5369     }
5370 }
5371
5372 /* Function vect_setup_realignment
5373
5374    This function is called when vectorizing an unaligned load using
5375    the dr_explicit_realign[_optimized] scheme.
5376    This function generates the following code at the loop prolog:
5377
5378       p = initial_addr;
5379    x  msq_init = *(floor(p));   # prolog load
5380       realignment_token = call target_builtin;
5381     loop:
5382    x  msq = phi (msq_init, ---)
5383
5384    The stmts marked with x are generated only for the case of
5385    dr_explicit_realign_optimized.
5386
5387    The code above sets up a new (vector) pointer, pointing to the first
5388    location accessed by STMT_INFO, and a "floor-aligned" load using that
5389    pointer.  It also generates code to compute the "realignment-token"
5390    (if the relevant target hook was defined), and creates a phi-node at the
5391    loop-header bb whose arguments are the result of the prolog-load (created
5392    by this function) and the result of a load that takes place in the loop
5393    (to be created by the caller to this function).
5394
5395    For the case of dr_explicit_realign_optimized:
5396    The caller to this function uses the phi-result (msq) to create the
5397    realignment code inside the loop, and sets up the missing phi argument,
5398    as follows:
5399     loop:
5400       msq = phi (msq_init, lsq)
5401       lsq = *(floor(p'));        # load in loop
5402       result = realign_load (msq, lsq, realignment_token);
5403
5404    For the case of dr_explicit_realign:
5405     loop:
5406       msq = *(floor(p));        # load in loop
5407       p' = p + (VS-1);
5408       lsq = *(floor(p'));       # load in loop
5409       result = realign_load (msq, lsq, realignment_token);
5410
5411    Input:
5412    STMT_INFO - (scalar) load stmt to be vectorized.  This load accesses
5413                a memory location that may be unaligned.
5414    GSI - place where new code is to be inserted.
5415    ALIGNMENT_SUPPORT_SCHEME - which of the two realignment schemes is used.
5416    INIT_ADDR - address precomputed by the caller in the loop, or NULL_TREE.
5417
5418    Output:
5419    REALIGNMENT_TOKEN - result of the builtin_mask_for_load hook, if defined.
5420    AT_LOOP - if non-NULL, set to the loop chosen for the initial load.
5421    Return value - loop-header phi result; NULL_TREE for dr_explicit_realign.  */
5422
5423 tree
5424 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5425                         tree *realignment_token,
5426                         enum dr_alignment_support alignment_support_scheme,
5427                         tree init_addr,
5428                         struct loop **at_loop)
5429 {
5430   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5431   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5432   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5433   struct data_reference *dr = dr_info->dr;
5434   struct loop *loop = NULL;
5435   edge pe = NULL;
5436   tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5437   tree vec_dest;
5438   gimple *inc;
5439   tree ptr;
5440   tree data_ref;
5441   basic_block new_bb;
5442   tree msq_init = NULL_TREE;
5443   tree new_temp;
5444   gphi *phi_stmt;
5445   tree msq = NULL_TREE;
5446   gimple_seq stmts = NULL;
5447   bool compute_in_loop = false;
5448   bool nested_in_vect_loop = false;
5449   struct loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5450   struct loop *loop_for_initial_load = NULL;
5451
5452   if (loop_vinfo)
5453     {
5454       loop = LOOP_VINFO_LOOP (loop_vinfo);
5455       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5456     }
5457
5458   gcc_assert (alignment_support_scheme == dr_explicit_realign
5459               || alignment_support_scheme == dr_explicit_realign_optimized);
5460
5461   /* We need to generate three things:
5462      1. the misalignment computation
5463      2. the extra vector load (for the optimized realignment scheme).
5464      3. the phi node for the two vectors from which the realignment is
5465       done (for the optimized realignment scheme).  */
5466
5467   /* 1. Determine where to generate the misalignment computation.
5468
5469      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5470      calculation will be generated by this function, outside the loop (in the
5471      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5472      caller, inside the loop.
5473
5474      Background: If the misalignment remains fixed throughout the iterations of
5475      the loop, then both realignment schemes are applicable, and also the
5476      misalignment computation can be done outside LOOP.  This is because we are
5477      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5478      are a multiple of VS (the Vector Size), and therefore the misalignment in
5479      different vectorized LOOP iterations is always the same.
5480      The problem arises only if the memory access is in an inner-loop nested
5481      inside LOOP, which is now being vectorized using outer-loop vectorization.
5482      This is the only case when the misalignment of the memory access may not
5483      remain fixed throughout the iterations of the inner-loop (as explained in
5484      detail in vect_supportable_dr_alignment).  In this case, not only is the
5485      optimized realignment scheme not applicable, but also the misalignment
5486      computation (and generation of the realignment token that is passed to
5487      REALIGN_LOAD) have to be done inside the loop.
5488
5489      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5490      or not, which in turn determines if the misalignment is computed inside
5491      the inner-loop, or outside LOOP.  */
5492
5493   if (init_addr != NULL_TREE || !loop_vinfo)
5494     {
5495       compute_in_loop = true;
5496       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5497     }
5498
5499
5500   /* 2. Determine where to generate the extra vector load.
5501
5502      For the optimized realignment scheme, instead of generating two vector
5503      loads in each iteration, we generate a single extra vector load in the
5504      preheader of the loop, and in each iteration reuse the result of the
5505      vector load from the previous iteration.  In case the memory access is in
5506      an inner-loop nested inside LOOP, which is now being vectorized using
5507      outer-loop vectorization, we need to determine whether this initial vector
5508      load should be generated at the preheader of the inner-loop, or can be
5509      generated at the preheader of LOOP.  If the memory access has no evolution
5510      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5511      to be generated inside LOOP (in the preheader of the inner-loop).  */
5512
5513   if (nested_in_vect_loop)
5514     {
5515       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5516       bool invariant_in_outerloop =
5517             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5518       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5519     }
5520   else
5521     loop_for_initial_load = loop;
5522   if (at_loop)
5523     *at_loop = loop_for_initial_load;
5524
5525   if (loop_for_initial_load)
5526     pe = loop_preheader_edge (loop_for_initial_load);
5527
5528   /* 3. For the case of the optimized realignment, create the first vector
5529       load at the loop preheader.  */
5530
5531   if (alignment_support_scheme == dr_explicit_realign_optimized)
5532     {
5533       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5534       gassign *new_stmt;
5535
5536       gcc_assert (!compute_in_loop);
5537       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5538       ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5539                                       loop_for_initial_load, NULL_TREE,
5540                                       &init_addr, NULL, &inc, true);
5541       if (TREE_CODE (ptr) == SSA_NAME)
5542         new_temp = copy_ssa_name (ptr);
5543       else
5544         new_temp = make_ssa_name (TREE_TYPE (ptr));
5545       poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5546       tree type = TREE_TYPE (ptr);
5547       new_stmt = gimple_build_assign
5548                    (new_temp, BIT_AND_EXPR, ptr,
5549                     fold_build2 (MINUS_EXPR, type,
5550                                  build_int_cst (type, 0),
5551                                  build_int_cst (type, align)));
5552       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5553       gcc_assert (!new_bb);
5554       data_ref
5555         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5556                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5557       vect_copy_ref_info (data_ref, DR_REF (dr));
5558       new_stmt = gimple_build_assign (vec_dest, data_ref);
5559       new_temp = make_ssa_name (vec_dest, new_stmt);
5560       gimple_assign_set_lhs (new_stmt, new_temp);
5561       if (pe)  /* NOTE(review): PE is already used unguarded above.  */
5562         {
5563           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5564           gcc_assert (!new_bb);
5565         }
5566       else
5567          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5568
5569       msq_init = gimple_assign_lhs (new_stmt);
5570     }
5571
5572   /* 4. Create realignment token using a target builtin, if available.
5573       It is done either inside the containing loop, or before LOOP (as
5574       determined above).  */
5575
5576   if (targetm.vectorize.builtin_mask_for_load)
5577     {
5578       gcall *new_stmt;
5579       tree builtin_decl;
5580
5581       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5582       if (!init_addr)
5583         {
5584           /* Generate the INIT_ADDR computation outside LOOP.  */
5585           init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5586                                                             NULL_TREE);
5587           if (loop)
5588             {
5589               pe = loop_preheader_edge (loop);
5590               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5591               gcc_assert (!new_bb);
5592             }
5593           else
5594              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5595         }
5596
5597       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5598       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5599       vec_dest =
5600         vect_create_destination_var (scalar_dest,
5601                                      gimple_call_return_type (new_stmt));
5602       new_temp = make_ssa_name (vec_dest, new_stmt);
5603       gimple_call_set_lhs (new_stmt, new_temp);
5604
5605       if (compute_in_loop)
5606         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5607       else
5608         {
5609           /* Generate the misalignment computation outside LOOP.  */
5610           pe = loop_preheader_edge (loop);
5611           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5612           gcc_assert (!new_bb);
5613         }
5614
5615       *realignment_token = gimple_call_lhs (new_stmt);
5616
5617       /* The result of the CALL_EXPR to this builtin is determined from
5618          the value of the parameter and no global variables are touched
5619          which makes the builtin a "const" function.  Requiring the
5620          builtin to have the "const" attribute makes it unnecessary
5621          to call mark_call_clobbered.  */
5622       gcc_assert (TREE_READONLY (builtin_decl));
5623     }
5624
5625   if (alignment_support_scheme == dr_explicit_realign)
5626     return msq;
5627
5628   gcc_assert (!compute_in_loop);
5629   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5630
5631
5632   /* 5. Create msq = phi <msq_init, ---> in loop; the caller adds lsq.  */
5633
5634   pe = loop_preheader_edge (containing_loop);
5635   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5636   msq = make_ssa_name (vec_dest);
5637   phi_stmt = create_phi_node (msq, containing_loop->header);
5638   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5639
5640   return msq;
5641 }
5642
5643
5644 /* Function vect_grouped_load_supported.
5645
5646    COUNT is the size of the load group (the number of statements plus the
5647    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5648    only one statement, with a gap of COUNT - 1.
5649
5650    Returns true if a suitable permute exists.  */
5651
5652 bool
5653 vect_grouped_load_supported (tree vectype, bool single_element_p,
5654                              unsigned HOST_WIDE_INT count)
5655 {
5656   machine_mode mode = TYPE_MODE (vectype);
5657
5658   /* If this is single-element interleaving with an element distance
5659      that leaves unused vector loads around, punt - we at least create
5660      very sub-optimal code in that case (and blow up memory,
5661      see PR65518).  */
5662   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5663     {
5664       if (dump_enabled_p ())
5665         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5666                          "single-element interleaving not supported "
5667                          "for not adjacent vector loads\n");
5668       return false;
5669     }
5670
5671   /* vect_permute_load_chain requires the group size to be equal to 3 or
5672      be a power of two.  */
5673   if (count != 3 && exact_log2 (count) == -1)
5674     {
5675       if (dump_enabled_p ())
5676         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5677                          "the size of the group of accesses"
5678                          " is not a power of 2 or not equal to 3\n");
5679       return false;
5680     }
5681
5682   /* Check that the permutation is supported.  */
5683   if (VECTOR_MODE_P (mode))
5684     {
5685       unsigned int i, j;
5686       if (count == 3)
5687         {
5688           unsigned int nelt;
5689           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5690             {
5691               if (dump_enabled_p ())
5692                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5693                                  "cannot handle groups of 3 loads for"
5694                                  " variable-length vectors\n");
5695               return false;
5696             }
5697
5698           vec_perm_builder sel (nelt, nelt, 1);
5699           sel.quick_grow (nelt);
5700           vec_perm_indices indices;
5701           unsigned int k;
5702           for (k = 0; k < 3; k++)  /* Masks match vect_permute_load_chain.  */
5703             {
5704               for (i = 0; i < nelt; i++)
5705                 if (3 * i + k < 2 * nelt)
5706                   sel[i] = 3 * i + k;
5707                 else
5708                   sel[i] = 0;
5709               indices.new_vector (sel, 2, nelt);
5710               if (!can_vec_perm_const_p (mode, indices))
5711                 {
5712                   if (dump_enabled_p ())
5713                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5714                                      "shuffle of 3 loads is not supported by"
5715                                      " target\n");
5716                   return false;
5717                 }
5718               for (i = 0, j = 0; i < nelt; i++)
5719                 if (3 * i + k < 2 * nelt)
5720                   sel[i] = i;
5721                 else
5722                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5723               indices.new_vector (sel, 2, nelt);
5724               if (!can_vec_perm_const_p (mode, indices))
5725                 {
5726                   if (dump_enabled_p ())
5727                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5728                                      "shuffle of 3 loads is not supported by"
5729                                      " target\n");
5730                   return false;
5731                 }
5732             }
5733           return true;
5734         }
5735       else
5736         {
5737           /* If COUNT is not equal to 3 then only power of 2 is supported.  */
5738           gcc_assert (pow2p_hwi (count));
5739           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5740
5741           /* The encoding has a single stepped pattern: {0, 2, 4, ...}.  */
5742           vec_perm_builder sel (nelt, 1, 3);
5743           sel.quick_grow (3);
5744           for (i = 0; i < 3; i++)
5745             sel[i] = i * 2;
5746           vec_perm_indices indices (sel, 2, nelt);
5747           if (can_vec_perm_const_p (mode, indices))
5748             {
5749               for (i = 0; i < 3; i++)
5750                 sel[i] = i * 2 + 1;
5751               indices.new_vector (sel, 2, nelt);
5752               if (can_vec_perm_const_p (mode, indices))
5753                 return true;
5754             }
5755         }
5756     }
5757
5758   if (dump_enabled_p ())
5759     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5760                      "extract even/odd not supported by target\n");
5761   return false;
5762 }
5763
5764 /* Return TRUE if vec_{mask_,}load_lanes is available for COUNT vectors of
5765    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5766
5767 bool
5768 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5769                            bool masked_p)
5770 {
5771   if (masked_p)
5772     return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5773                                          vec_mask_load_lanes_optab,
5774                                          vectype, count);
5775   else
5776     return vect_lanes_optab_supported_p ("vec_load_lanes",
5777                                          vec_load_lanes_optab,
5778                                          vectype, count);
5779 }
5780
5781 /* Function vect_permute_load_chain.
5782
5783    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5784    a power of 2 or equal to 3, generate extract_even/odd stmts (or, for
5785    LENGTH 3, pairs of three-way shuffles) to reorder the input data
5786    correctly.  Return the final references for loads in RESULT_CHAIN.
5787
5788    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5789    The input is 4 vectors each containing 8 elements. We assign a number to each
5790    element, the input sequence is:
5791
5792    1st vec:   0  1  2  3  4  5  6  7
5793    2nd vec:   8  9 10 11 12 13 14 15
5794    3rd vec:  16 17 18 19 20 21 22 23
5795    4th vec:  24 25 26 27 28 29 30 31
5796
5797    The output sequence should be:
5798
5799    1st vec:  0 4  8 12 16 20 24 28
5800    2nd vec:  1 5  9 13 17 21 25 29
5801    3rd vec:  2 6 10 14 18 22 26 30
5802    4th vec:  3 7 11 15 19 23 27 31
5803
5804    i.e., the first output vector should contain the first elements of each
5805    interleaving group, etc.
5806
5807    We use extract_even/odd instructions to create such output.  The input of
5808    each extract_even/odd operation is two vectors
5809    1st vec    2nd vec
5810    0 1 2 3    4 5 6 7
5811
5812    and the output is the vector of extracted even/odd elements.  The output of
5813    extract_even will be:   0 2 4 6
5814    and of extract_odd:     1 3 5 7
5815
5816
5817    The permutation is done in log LENGTH stages.  In each stage extract_even
5818    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5819    their order.  In our example,
5820
5821    E1: extract_even (1st vec, 2nd vec)
5822    E2: extract_odd (1st vec, 2nd vec)
5823    E3: extract_even (3rd vec, 4th vec)
5824    E4: extract_odd (3rd vec, 4th vec)
5825
5826    The output for the first stage will be:
5827
5828    E1:  0  2  4  6  8 10 12 14
5829    E2:  1  3  5  7  9 11 13 15
5830    E3: 16 18 20 22 24 26 28 30
5831    E4: 17 19 21 23 25 27 29 31
5832
5833    In order to proceed and create the correct sequence for the next stage (or
5834    for the correct output, if the second stage is the last one, as in our
5835    example), we first put the output of extract_even operation and then the
5836    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5837    The input for the second stage is:
5838
5839    1st vec (E1):  0  2  4  6  8 10 12 14
5840    2nd vec (E3): 16 18 20 22 24 26 28 30
5841    3rd vec (E2):  1  3  5  7  9 11 13 15
5842    4th vec (E4): 17 19 21 23 25 27 29 31
5843
5844    The output of the second stage:
5845
5846    E1: 0 4  8 12 16 20 24 28
5847    E2: 2 6 10 14 18 22 26 30
5848    E3: 1 5  9 13 17 21 25 29
5849    E4: 3 7 11 15 19 23 27 31
5850
5851    And RESULT_CHAIN after reordering:
5852
5853    1st vec (E1):  0 4  8 12 16 20 24 28
5854    2nd vec (E3):  1 5  9 13 17 21 25 29
5855    3rd vec (E2):  2 6 10 14 18 22 26 30
5856    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5857
5858 static void
5859 vect_permute_load_chain (vec<tree> dr_chain,
5860                          unsigned int length,
5861                          stmt_vec_info stmt_info,
5862                          gimple_stmt_iterator *gsi,
5863                          vec<tree> *result_chain)
5864 {
5865   tree data_ref, first_vect, second_vect;
5866   tree perm_mask_even, perm_mask_odd;
5867   tree perm3_mask_low, perm3_mask_high;
5868   gimple *perm_stmt;
5869   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5870   unsigned int i, j, log_length = exact_log2 (length);
5871
5872   result_chain->quick_grow (length);
5873   memcpy (result_chain->address (), dr_chain.address (),
5874           length * sizeof (tree));
5875
5876   if (length == 3)
5877     {
5878       /* vect_grouped_load_supported ensures that this is constant.  */
5879       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5880       unsigned int k;
5881
5882       vec_perm_builder sel (nelt, nelt, 1);
5883       sel.quick_grow (nelt);
5884       vec_perm_indices indices;
5885       for (k = 0; k < 3; k++)
5886         {
5887           for (i = 0; i < nelt; i++)
5888             if (3 * i + k < 2 * nelt)
5889               sel[i] = 3 * i + k;
5890             else
5891               sel[i] = 0;
5892           indices.new_vector (sel, 2, nelt);
5893           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5894
5895           for (i = 0, j = 0; i < nelt; i++)
5896             if (3 * i + k < 2 * nelt)
5897               sel[i] = i;
5898             else
5899               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5900           indices.new_vector (sel, 2, nelt);
5901           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5902
5903           first_vect = dr_chain[0];
5904           second_vect = dr_chain[1];
5905
5906           /* Create interleaving stmt (low part of):
5907              low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5908                                                             ...}>  */
5909           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5910           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5911                                            second_vect, perm3_mask_low);
5912           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5913
5914           /* Create interleaving stmt (high part of):
5915              high = VEC_PERM_EXPR <low, dr_chain[2],
5916                                    {0, 1, ..., nelt + (nelt + k) % 3, ...}>  */
5917           first_vect = data_ref;
5918           second_vect = dr_chain[2];
5919           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5920           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5921                                            second_vect, perm3_mask_high);
5922           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5923           (*result_chain)[k] = data_ref;
5924         }
5925     }
5926   else
5927     {
5928       /* If length is not equal to 3 then only power of 2 is supported.  */
5929       gcc_assert (pow2p_hwi (length));
5930
5931       /* The encoding has a single stepped pattern.  */
5932       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5933       vec_perm_builder sel (nelt, 1, 3);
5934       sel.quick_grow (3);
5935       for (i = 0; i < 3; ++i)
5936         sel[i] = i * 2;
5937       vec_perm_indices indices (sel, 2, nelt);
5938       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5939
5940       for (i = 0; i < 3; ++i)
5941         sel[i] = i * 2 + 1;
5942       indices.new_vector (sel, 2, nelt);
5943       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5944
5945       for (i = 0; i < log_length; i++)
5946         {
5947           for (j = 0; j < length; j += 2)
5948             {
5949               first_vect = dr_chain[j];
5950               second_vect = dr_chain[j+1];
5951
5952               /* data_ref = permute_even (first_vect, second_vect);  */
5953               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5954               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5955                                                first_vect, second_vect,
5956                                                perm_mask_even);
5957               vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5958               (*result_chain)[j/2] = data_ref;
5959
5960               /* data_ref = permute_odd (first_vect, second_vect);  */
5961               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5962               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5963                                                first_vect, second_vect,
5964                                                perm_mask_odd);
5965               vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5966               (*result_chain)[j/2+length/2] = data_ref;
5967             }
5968           memcpy (dr_chain.address (), result_chain->address (),
5969                   length * sizeof (tree));
5970         }
5971     }
5972 }
5973
5974 /* Function vect_shift_permute_load_chain.
5975
5976    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5977    sequence of stmts to reorder the input data accordingly.
5978    Return the final references for loads in RESULT_CHAIN.
5979    Return true if successful, false otherwise.
5980
5981    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5982    The input is 3 vectors each containing 8 elements.  We assign a
5983    number to each element, the input sequence is:
5984
5985    1st vec:   0  1  2  3  4  5  6  7
5986    2nd vec:   8  9 10 11 12 13 14 15
5987    3rd vec:  16 17 18 19 20 21 22 23
5988
5989    The output sequence should be:
5990
5991    1st vec:  0 3 6  9 12 15 18 21
5992    2nd vec:  1 4 7 10 13 16 19 22
5993    3rd vec:  2 5 8 11 14 17 20 23
5994
5995    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5996
5997    First we shuffle all 3 vectors to get correct elements order:
5998
5999    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
6000    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
6001    3rd vec:  (16 19 22) (17 20 23) (18 21)
6002
6003    Next we unite and shift vector 3 times:
6004
6005    1st step:
6006      shift right by 6 the concatenation of:
6007      "1st vec" and  "2nd vec"
6008        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6009      "2nd vec" and  "3rd vec"
6010        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6011      "3rd vec" and  "1st vec"
6012        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6013                              | New vectors                   |
6014
6015      So that now new vectors are:
6016
6017      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6018      2nd vec:  (10 13) (16 19 22) (17 20 23)
6019      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6020
6021    2nd step:
6022      shift right by 5 the concatenation of:
6023      "1st vec" and  "3rd vec"
6024        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6025      "2nd vec" and  "1st vec"
6026        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6027      "3rd vec" and  "2nd vec"
6028        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6029                           | New vectors                   |
6030
6031      So that now new vectors are:
6032
6033      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6034      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6035      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6036
6037    3rd step:
6038      shift right by 5 the concatenation of:
6039      "1st vec" and  "1st vec"
6040        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6041      shift right by 3 the concatenation of:
6042      "2nd vec" and  "2nd vec"
6043                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6044                           | New vectors                   |
6045
6046      So that now all vectors are READY:
6047      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6048      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6049      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6050
6051    This algorithm is faster than one in vect_permute_load_chain if:
6052      1.  "shift of a concatenation" is faster than general permutation.
6053          This is usually so.
6054      2.  The TARGET machine can't execute vector instructions in parallel.
6055          This is because each step of the algorithm depends on previous.
6056          The algorithm in vect_permute_load_chain is much more parallel.
6057
6058    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6059 */
6060
6061 static bool
6062 vect_shift_permute_load_chain (vec<tree> dr_chain,
6063                                unsigned int length,
6064                                stmt_vec_info stmt_info,
6065                                gimple_stmt_iterator *gsi,
6066                                vec<tree> *result_chain)
6067 {
6068   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6069   tree perm2_mask1, perm2_mask2, perm3_mask;
6070   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6071   gimple *perm_stmt;
6072   /* Every failure path returns false before any statement has been emitted.  */
6073   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6074   unsigned int i;
6075   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6076
6077   unsigned HOST_WIDE_INT nelt, vf;
6078   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6079       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6080     /* Not supported for variable-length vectors.  */
6081     return false;
6082   /* SEL is reused to build every permutation selector below.  */
6083   vec_perm_builder sel (nelt, nelt, 1);
6084   sel.quick_grow (nelt);
6085   /* Seed RESULT_CHAIN with a copy of the LENGTH input vectors.  */
6086   result_chain->quick_grow (length);
6087   memcpy (result_chain->address (), dr_chain.address (),
6088           length * sizeof (tree));
6089   /* Handled: power-of-two LENGTH with VF > 4 (here), LENGTH 3 with VF > 2.  */
6090   if (pow2p_hwi (length) && vf > 4)
6091     {
6092       unsigned int j, log_length = exact_log2 (length);
6093       for (i = 0; i < nelt / 2; ++i)
6094         sel[i] = i * 2;  /* Low half: even elements.  */
6095       for (i = 0; i < nelt / 2; ++i)
6096         sel[nelt / 2 + i] = i * 2 + 1;  /* High half: odd elements.  */
6097       vec_perm_indices indices (sel, 2, nelt);
6098       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6099         {
6100           if (dump_enabled_p ())
6101             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6102                              "shuffle of 2 fields structure is not "
6103                              "supported by target\n");
6104           return false;
6105         }
6106       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6107       /* The second mask mirrors the first: odd elements low, even high.  */
6108       for (i = 0; i < nelt / 2; ++i)
6109         sel[i] = i * 2 + 1;
6110       for (i = 0; i < nelt / 2; ++i)
6111         sel[nelt / 2 + i] = i * 2;
6112       indices.new_vector (sel, 2, nelt);
6113       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6114         {
6115           if (dump_enabled_p ())
6116             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6117                              "shuffle of 2 fields structure is not "
6118                              "supported by target\n");
6119           return false;
6120         }
6121       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6122
6123       /* Generating permutation constant to shift all elements.
6124          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6125       for (i = 0; i < nelt; i++)
6126         sel[i] = nelt / 2 + i;
6127       indices.new_vector (sel, 2, nelt);
6128       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6129         {
6130           if (dump_enabled_p ())
6131             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6132                              "shift permutation is not supported by target\n");
6133           return false;
6134         }
6135       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6136
6137       /* Generating permutation constant to select vector from 2.
6138          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6139       for (i = 0; i < nelt / 2; i++)
6140         sel[i] = i;
6141       for (i = nelt / 2; i < nelt; i++)
6142         sel[i] = nelt + i;
6143       indices.new_vector (sel, 2, nelt);
6144       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6145         {
6146           if (dump_enabled_p ())
6147             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6148                              "select is not supported by target\n");
6149           return false;
6150         }
6151       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6152       /* Run log2 (LENGTH) rounds; each round's results feed the next round.  */
6153       for (i = 0; i < log_length; i++)
6154         {
6155           for (j = 0; j < length; j += 2)
6156             {
6157               first_vect = dr_chain[j];
6158               second_vect = dr_chain[j + 1];
6159               /* Split FIRST_VECT into evens|odds, SECOND_VECT into odds|evens.  */
6160               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6161               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6162                                                first_vect, first_vect,
6163                                                perm2_mask1);
6164               vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6165               vect[0] = data_ref;
6166
6167               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6168               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6169                                                second_vect, second_vect,
6170                                                perm2_mask2);
6171               vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6172               vect[1] = data_ref;
6173               /* High half of VECT[0] then low half of VECT[1]: the odd elements.  */
6174               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6175               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6176                                                vect[0], vect[1], shift1_mask);
6177               vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6178               (*result_chain)[j/2 + length/2] = data_ref;
6179               /* Low half of VECT[0], high half of VECT[1]: the even elements.  */
6180               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6181               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6182                                                vect[0], vect[1], select_mask);
6183               vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6184               (*result_chain)[j/2] = data_ref;
6185             }
6186           memcpy (dr_chain.address (), result_chain->address (),
6187                   length * sizeof (tree));
6188         }
6189       return true;
6190     }
6191   if (length == 3 && vf > 2)
6192     {
6193       unsigned int k = 0, l = 0;
6194
6195       /* Generating permutation constant to get all elements in right order.
6196          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6197       for (i = 0; i < nelt; i++)
6198         {
6199           if (3 * k + (l % 3) >= nelt)
6200             {
6201               k = 0;
6202               l += (3 - (nelt % 3));
6203             }
6204           sel[i] = 3 * k + (l % 3);
6205           k++;
6206         }
6207       vec_perm_indices indices (sel, 2, nelt);
6208       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6209         {
6210           if (dump_enabled_p ())
6211             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6212                              "shuffle of 3 fields structure is not "
6213                              "supported by target\n");
6214           return false;
6215         }
6216       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6217
6218       /* Generating permutation constant to shift all elements.
6219          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6220       for (i = 0; i < nelt; i++)
6221         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6222       indices.new_vector (sel, 2, nelt);
6223       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6224         {
6225           if (dump_enabled_p ())
6226             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6227                              "shift permutation is not supported by target\n");
6228           return false;
6229         }
6230       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6231
6232       /* Generating permutation constant to shift all elements.
6233          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6234       for (i = 0; i < nelt; i++)
6235         sel[i] = 2 * (nelt / 3) + 1 + i;
6236       indices.new_vector (sel, 2, nelt);
6237       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6238         {
6239           if (dump_enabled_p ())
6240             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6241                              "shift permutation is not supported by target\n");
6242           return false;
6243         }
6244       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6245
6246       /* Generating permutation constant to shift all elements.
6247          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6248       for (i = 0; i < nelt; i++)
6249         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6250       indices.new_vector (sel, 2, nelt);
6251       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6252         {
6253           if (dump_enabled_p ())
6254             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6255                              "shift permutation is not supported by target\n");
6256           return false;
6257         }
6258       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6259
6260       /* Generating permutation constant to shift all elements.
6261          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6262       for (i = 0; i < nelt; i++)
6263         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6264       indices.new_vector (sel, 2, nelt);
6265       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6266         {
6267           if (dump_enabled_p ())
6268             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6269                              "shift permutation is not supported by target\n");
6270           return false;
6271         }
6272       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6273       /* Apply PERM3_MASK to each of the three input vectors.  */
6274       for (k = 0; k < 3; k++)
6275         {
6276           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6277           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6278                                            dr_chain[k], dr_chain[k],
6279                                            perm3_mask);
6280           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6281           vect[k] = data_ref;
6282         }
6283       /* First shift stage over each cyclically adjacent pair of VECT.  */
6284       for (k = 0; k < 3; k++)
6285         {
6286           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6287           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6288                                            vect[k % 3], vect[(k + 1) % 3],
6289                                            shift1_mask);
6290           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6291           vect_shift[k] = data_ref;
6292         }
6293       /* Second shift stage over pairs (1,0), (0,2) and (2,1) of VECT_SHIFT.  */
6294       for (k = 0; k < 3; k++)
6295         {
6296           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6297           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6298                                            vect_shift[(4 - k) % 3],
6299                                            vect_shift[(3 - k) % 3],
6300                                            shift2_mask);
6301           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6302           vect[k] = data_ref;
6303         }
6304       /* VECT[2] is stored as is; VECT[0] and VECT[1] get one more shift each.  */
6305       (*result_chain)[3 - (nelt % 3)] = vect[2];
6306
6307       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6308       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6309                                        vect[0], shift3_mask);
6310       vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6311       (*result_chain)[nelt % 3] = data_ref;
6312
6313       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6314       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6315                                        vect[1], shift4_mask);
6316       vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6317       (*result_chain)[0] = data_ref;
6318       return true;
6319     }
6320   return false;
6321 }
6322
6323 /* Function vect_transform_grouped_load.
6324
6325    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6326    to perform their permutation and ascribe the result vectorized statements to
6327    the scalar statements.
6328 */
6329
6330 void
6331 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6332                              int size, gimple_stmt_iterator *gsi)
6333 {
6334   /* RESULT_CHAIN collects the permuted vectors built from the interleaved
6335      data-refs in DR_CHAIN; room for SIZE entries is reserved up front.  */
6336   vec<tree> result_chain = vNULL;
6337   result_chain.create (size);
6338
6339   /* Use the general permutation outright when the target's VEC_PERM_EXPR
6340      reassociation width exceeds 1 or SIZE is a power of two; otherwise try
6341      the shift-based variant first and fall back if it returns false.  */
6342   machine_mode vmode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6343   bool general_p
6344     = (targetm.sched.reassociation_width (VEC_PERM_EXPR, vmode) > 1
6345        || pow2p_hwi (size));
6346   if (!general_p)
6347     general_p = !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6348                                                 gsi, &result_chain);
6349   if (general_p)
6350     vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6351   vect_record_grouped_load_vectors (stmt_info, result_chain);
6352   result_chain.release ();
6353 }
6354
6355 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6356    generated as part of the vectorization of STMT_INFO.  Assign the statement
6357    for each vector to the associated scalar statement.  */
6358
6359 void
6360 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6361                                   vec<tree> result_chain)
6362 {
6363   vec_info *vinfo = stmt_info->vinfo;
6364   stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6365
6366   /* CUR_INFO is the group member that the next recorded vector belongs
6367      to; STEPS_SEEN counts the element steps taken since the previous
6368      member, starting at 1.  */
6369   stmt_vec_info cur_info = first_stmt_info;
6370   unsigned int steps_seen = 1;
6371
6372   /* The group is walked from its first member, so its members are met in
6373      the same order as the vectors of RESULT_CHAIN.  */
6374   unsigned int idx;
6375   tree vec_def;
6376   FOR_EACH_VEC_ELT (result_chain, idx, vec_def)
6377     {
6378       /* No group member left: nothing more to record.  */
6379       if (!cur_info)
6380         break;
6381
6382       /* DR_GROUP_GAP is the number of element steps from the previous
6383          access, 1 meaning no gap.  Vectors falling into a gap have no
6384          scalar statement to attach to; they are passed over here and the
6385          loads created for them are left to dead code elimination.  The
6386          first member always exists, so it is exempt from the check.  */
6387       if (cur_info != first_stmt_info
6388           && steps_seen < DR_GROUP_GAP (cur_info))
6389         {
6390           steps_seen++;
6391           continue;
6392         }
6393
6394       /* CUR_INFO cannot be null here because of the early exit above, so
6395          the vector is recorded without a further test.  */
6396       stmt_vec_info new_stmt_info = vinfo->lookup_def (vec_def);
6397       stmt_vec_info head = STMT_VINFO_VEC_STMT (cur_info);
6398       if (!head)
6399         {
6400           /* First copy: it becomes the vectorized statement itself.  */
6401           STMT_VINFO_VEC_STMT (cur_info) = new_stmt_info;
6402         }
6403       else
6404         {
6405           /* A vectorized statement is already recorded, which is assumed
6406              to mean multiple copies: append the new one at the end of
6407              the RELATED_STMT chain hanging off it.  */
6408           stmt_vec_info tail = head;
6409           for (stmt_vec_info succ = STMT_VINFO_RELATED_STMT (tail); succ;
6410                succ = STMT_VINFO_RELATED_STMT (succ))
6411             tail = succ;
6412           STMT_VINFO_RELATED_STMT (tail) = new_stmt_info;
6413         }
6414
6415       /* Move on to the next member of the group and restart the step
6416          count for it.  */
6417       cur_info = DR_GROUP_NEXT_ELEMENT (cur_info);
6418       steps_seen = 1;
6419     }
6420 }
6421
6422 /* Function vect_can_force_dr_alignment_p.
6423
6424    Returns whether the alignment of a DECL can be forced to be aligned
6425    on ALIGNMENT bit boundary.  */
6426
6427 bool
6428 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6429 {
6430   /* Reject non-variables and symbol-table decls whose node does not allow
6431      the alignment to be increased.  */
6432   if (!VAR_P (decl)
6433       || (decl_in_symtab_p (decl)
6434           && !symtab_node::get (decl)->can_increase_alignment_p ()))
6435     return false;
6436
6437   /* Static decls use the object-file limit, all others the stack limit.  */
6438   unsigned HOST_WIDE_INT cap = (TREE_STATIC (decl)
6439                                 ? (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT
6440                                 : (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT);
6441   return known_le (alignment, cap);
6442 }
6443
6444
6445 /* Return whether the data reference DR_INFO is supported with respect to its
6446    alignment.
6447    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6448    if it is aligned, i.e., check if it is possible to vectorize it with
6449    different alignment.  */
6450
6451 enum dr_alignment_support
6452 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6453                                bool check_aligned_accesses)
6454 {
6455   data_reference *dr = dr_info->dr;
6456   stmt_vec_info stmt_info = dr_info->stmt;
6457   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6458   machine_mode mode = TYPE_MODE (vectype);
6459   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6460   struct loop *vect_loop = NULL;
6461   bool nested_in_vect_loop = false;
6462   /* Aligned accesses are accepted at once unless the caller asks to recheck.  */
6463   if (aligned_access_p (dr_info) && !check_aligned_accesses)
6464     return dr_aligned;
6465
6466   /* For now assume all conditional loads/stores support unaligned
6467      access without any special code.  */
6468   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6469     if (gimple_call_internal_p (stmt)
6470         && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6471             || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6472       return dr_unaligned_supported;
6473   /* Nesting is only examined when a loop is being vectorized.  */
6474   if (loop_vinfo)
6475     {
6476       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6477       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6478     }
6479
6480   /* Possibly unaligned access.  */
6481
6482   /* We can choose between using the implicit realignment scheme (generating
6483      a misaligned_move stmt) and the explicit realignment scheme (generating
6484      aligned loads with a REALIGN_LOAD).  There are two variants to the
6485      explicit realignment scheme: optimized, and unoptimized.
6486      We can optimize the realignment only if the step between consecutive
6487      vector loads is equal to the vector size.  Since the vector memory
6488      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6489      is guaranteed that the misalignment amount remains the same throughout the
6490      execution of the vectorized loop.  Therefore, we can create the
6491      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6492      at the loop preheader.
6493
6494      However, in the case of outer-loop vectorization, when vectorizing a
6495      memory access in the inner-loop nested within the LOOP that is now being
6496      vectorized, while it is guaranteed that the misalignment of the
6497      vectorized memory access will remain the same in different outer-loop
6498      iterations, it is *not* guaranteed that it will remain the same throughout
6499      the execution of the inner-loop.  This is because the inner-loop advances
6500      with the original scalar step (and not in steps of VS).  If the inner-loop
6501      step happens to be a multiple of VS, then the misalignment remains fixed
6502      and we can use the optimized realignment scheme.  For example:
6503
6504       for (i=0; i<N; i++)
6505         for (j=0; j<M; j++)
6506           s += a[i+j];
6507
6508      When vectorizing the i-loop in the above example, the step between
6509      consecutive vector loads is 1, and so the misalignment does not remain
6510      fixed across the execution of the inner-loop, and the realignment cannot
6511      be optimized (as illustrated in the following pseudo vectorized loop):
6512
6513       for (i=0; i<N; i+=4)
6514         for (j=0; j<M; j++){
6515           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6516                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6517                          // (assuming that we start from an aligned address).
6518           }
6519
6520      We therefore have to use the unoptimized realignment scheme:
6521
6522       for (i=0; i<N; i+=4)
6523           for (j=k; j<M; j+=4)
6524           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6525                            // that the misalignment of the initial address is
6526                            // 0).
6527
6528      The loop can then be vectorized as follows:
6529
6530       for (k=0; k<4; k++){
6531         rt = get_realignment_token (&vp[k]);
6532         for (i=0; i<N; i+=4){
6533           v1 = vp[i+k];
6534           for (j=k; j<M; j+=4){
6535             v2 = vp[i+j+VS-1];
6536             va = REALIGN_LOAD <v1,v2,rt>;
6537             vs += va;
6538             v1 = v2;
6539           }
6540         }
6541     } */
6542   /* Loads may qualify for explicit realignment; stores only ask the target hook.  */
6543   if (DR_IS_READ (dr))
6544     {
6545       bool is_packed = false;
6546       tree type = (TREE_TYPE (DR_REF (dr)));
6547       /* Needs the realign-load optab and, if the mask hook exists, a result from it.  */
6548       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6549           && (!targetm.vectorize.builtin_mask_for_load
6550               || targetm.vectorize.builtin_mask_for_load ()))
6551         {
6552           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6553
6554           /* If we are doing SLP then the accesses need not have the
6555              same alignment, instead it depends on the SLP group size.  */
6556           if (loop_vinfo
6557               && STMT_SLP_TYPE (stmt_info)
6558               && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6559                               * (DR_GROUP_SIZE
6560                                  (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6561                               TYPE_VECTOR_SUBPARTS (vectype)))
6562             ;  /* No explicit realignment; continue with the hook check below.  */
6563           else if (!loop_vinfo
6564                    || (nested_in_vect_loop
6565                        && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6566                                     GET_MODE_SIZE (TYPE_MODE (vectype)))))
6567             return dr_explicit_realign;
6568           else
6569             return dr_explicit_realign_optimized;
6570         }
6571       if (!known_alignment_for_access_p (dr_info))
6572         is_packed = not_size_aligned (DR_REF (dr));
6573
6574       if (targetm.vectorize.support_vector_misalignment
6575             (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6576         /* Can't software pipeline the loads, but can at least do them.  */
6577         return dr_unaligned_supported;
6578     }
6579   else
6580     {
6581       bool is_packed = false;
6582       tree type = (TREE_TYPE (DR_REF (dr)));
6583
6584       if (!known_alignment_for_access_p (dr_info))
6585         is_packed = not_size_aligned (DR_REF (dr));
6586
6587      if (targetm.vectorize.support_vector_misalignment
6588            (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6589        return dr_unaligned_supported;
6590     }
6591
6592   /* Unsupported.  */
6593   return dr_unaligned_unsupported;
6594 }