gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "params.h"
  53 #include "tree-cfg.h"
  54 #include "tree-hash-traits.h"
  55 #include "vec-perm-indices.h"
  56 #include "internal-fn.h"
  57
  58 /* Return true if load- or store-lanes optab OPTAB is implemented for
  59    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  60
  61 static bool
  62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  63                               tree vectype, unsigned HOST_WIDE_INT count)
  64 {
  65   machine_mode mode, array_mode;
  66   bool limit_p;
  67
  68   mode = TYPE_MODE (vectype);
  69   if (!targetm.array_mode (mode, count).exists (&array_mode))
  70     {
  71       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
  72       limit_p = !targetm.array_mode_supported_p (mode, count);
  73       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
  74         {
  75           if (dump_enabled_p ())
  76             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  77                              "no array mode for %s["
  78                              HOST_WIDE_INT_PRINT_DEC "]\n",
  79                              GET_MODE_NAME (mode), count);
  80           return false;
  81         }
  82     }
  83
  84   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  85     {
  86       if (dump_enabled_p ())
  87         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  88                          "cannot use %s<%s><%s>\n", name,
  89                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  90       return false;
  91     }
  92
  93   if (dump_enabled_p ())
  94     dump_printf_loc (MSG_NOTE, vect_location,
  95                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  96                      GET_MODE_NAME (mode));
  97
  98   return true;
  99 }
 100
 101
 102 /* Return the smallest scalar part of STMT.
 103    This is used to determine the vectype of the stmt.  We generally set the
 104    vectype according to the type of the result (lhs).  For stmts whose
 105    result-type is different than the type of the arguments (e.g., demotion,
 106    promotion), vectype will be reset appropriately (later).  Note that we have
 107    to visit the smallest datatype in this function, because that determines the
 108    VF.  If the smallest datatype in the loop is present only as the rhs of a
 109    promotion operation - we'd miss it.
 110    Such a case, where a variable of this datatype does not appear in the lhs
 111    anywhere in the loop, can only occur if it's an invariant: e.g.:
 112    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 113    invariant motion.  However, we cannot rely on invariant motion to always
 114    take invariants out of the loop, and so in the case of promotion we also
 115    have to check the rhs.
 116    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 117    types.  */
 118
 119 tree
 120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
 121                                HOST_WIDE_INT *rhs_size_unit)
 122 {
 123   tree scalar_type = gimple_expr_type (stmt);
 124   HOST_WIDE_INT lhs, rhs;
 125
 126   /* During the analysis phase, this function is called on arbitrary
 127      statements that might not have scalar results.  */
 128   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
 129     return scalar_type;
 130
 131   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 132
 133   if (is_gimple_assign (stmt)
 134       && (gimple_assign_cast_p (stmt)
 135           || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
 136           || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
 137           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
 138           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
 139           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
 140     {
 141       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 142
 143       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 144       if (rhs < lhs)
 145         scalar_type = rhs_type;
 146     }
 147
 148   *lhs_size_unit = lhs;
 149   *rhs_size_unit = rhs;
 150   return scalar_type;
 151 }
 152
 153
 154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 155    tested at run-time.  Return TRUE if DDR was successfully inserted.
 156    Return false if versioning is not supported.  */
 157
 158 static bool
 159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 160 {
 161   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 162
 163   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 164     return false;
 165
 166   if (!runtime_alias_check_p (ddr, loop,
 167                               optimize_loop_nest_for_speed_p (loop)))
 168     return false;
 169
 170   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 171   return true;
 172 }
 173
 174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
 175
 176 static void
 177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
 178 {
 179   vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
 180   for (unsigned int i = 0; i < checks.length(); ++i)
 181     if (checks[i] == value)
 182       return;
 183
 184   if (dump_enabled_p ())
 185     {
 186       dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
 187       dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
 188       dump_printf (MSG_NOTE, " is nonzero\n");
 189     }
 190   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
 191 }
 192
 193 /* Return true if we know that the order of vectorized STMT_A and
 194    vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
 195    At least one of the statements is a write.  */
 196
 197 static bool
 198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
 199 {
 200   stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
 201   stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
 202
 203   /* Single statements are always kept in their original order.  */
 204   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 205       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 206     return true;
 207
 208   /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
 209      group are emitted at the position of the first scalar load and all
 210      stores in a group are emitted at the position of the last scalar store.
 211      Thus writes will happen no earlier than their current position
 212      (but could happen later) while reads will happen no later than their
 213      current position (but could happen earlier).  Reordering is therefore
 214      only possible if the first access is a write.  */
 215   gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
 216   return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
 217 }
 218
 219 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 220    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 221    distances.  These distances are conservatively correct but they don't
 222    reflect a guaranteed dependence.
 223
 224    Return true if this function does all the work necessary to avoid
 225    an alias or false if the caller should use the dependence distances
 226    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 227    the depth of the loop described by LOOP_VINFO and the other arguments
 228    are as for vect_analyze_data_ref_dependence.  */
 229
 230 static bool
 231 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 232                                        loop_vec_info loop_vinfo,
 233                                        int loop_depth, unsigned int *max_vf)
 234 {
 235   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 236   lambda_vector dist_v;
 237   unsigned int i;
 238   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 239     {
 240       int dist = dist_v[loop_depth];
 241       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 242         {
 243           /* If the user asserted safelen >= DIST consecutive iterations
 244              can be executed concurrently, assume independence.
 245
 246              ??? An alternative would be to add the alias check even
 247              in this case, and vectorize the fallback loop with the
 248              maximum VF set to safelen.  However, if the user has
 249              explicitly given a length, it's less likely that that
 250              would be a win.  */
 251           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 252             {
 253               if ((unsigned int) loop->safelen < *max_vf)
 254                 *max_vf = loop->safelen;
 255               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 256               continue;
 257             }
 258
 259           /* For dependence distances of 2 or more, we have the option
 260              of limiting VF or checking for an alias at runtime.
 261              Prefer to check at runtime if we can, to avoid limiting
 262              the VF unnecessarily when the bases are in fact independent.
 263
 264              Note that the alias checks will be removed if the VF ends up
 265              being small enough.  */
 266           return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 267         }
 268     }
 269   return true;
 270 }
 271
 272
 273 /* Function vect_analyze_data_ref_dependence.
 274
 275    Return TRUE if there (might) exist a dependence between a memory-reference
 276    DRA and a memory-reference DRB.  When versioning for alias may check a
 277    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 278    the data dependence.  */
 279
 280 static bool
 281 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 282                                   loop_vec_info loop_vinfo,
 283                                   unsigned int *max_vf)
 284 {
 285   unsigned int i;
 286   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 287   struct data_reference *dra = DDR_A (ddr);
 288   struct data_reference *drb = DDR_B (ddr);
 289   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
 290   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
 291   lambda_vector dist_v;
 292   unsigned int loop_depth;
 293
 294   /* In loop analysis all data references should be vectorizable.  */
 295   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 296       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 297     gcc_unreachable ();
 298
 299   /* Independent data accesses.  */
 300   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 301     return false;
 302
 303   if (dra == drb
 304       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 305     return false;
 306
 307   /* We do not have to consider dependences between accesses that belong
 308      to the same group, unless the stride could be smaller than the
 309      group size.  */
 310   if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 311       && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 312           == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
 313       && !STMT_VINFO_STRIDED_P (stmtinfo_a))
 314     return false;
 315
 316   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 317      least two scalar iterations, there is always also a true dependence.
 318      As the vectorizer does not re-order loads and stores we can ignore
 319      the anti-dependence if TBAA can disambiguate both DRs similar to the
 320      case with known negative distance anti-dependences (positive
 321      distance anti-dependences would violate TBAA constraints).  */
 322   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 323        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 324       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 325                                  get_alias_set (DR_REF (drb))))
 326     return false;
 327
 328   /* Unknown data dependence.  */
 329   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 330     {
 331       /* If user asserted safelen consecutive iterations can be
 332          executed concurrently, assume independence.  */
 333       if (loop->safelen >= 2)
 334         {
 335           if ((unsigned int) loop->safelen < *max_vf)
 336             *max_vf = loop->safelen;
 337           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 338           return false;
 339         }
 340
 341       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 342           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 343         {
 344           if (dump_enabled_p ())
 345             {
 346               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 347                                "versioning for alias not supported for: "
 348                                "can't determine dependence between ");
 349               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 350                                  DR_REF (dra));
 351               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 352               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 353                                  DR_REF (drb));
 354               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 355             }
 356           return true;
 357         }
 358
 359       if (dump_enabled_p ())
 360         {
 361           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 362                            "versioning for alias required: "
 363                            "can't determine dependence between ");
 364           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 365                              DR_REF (dra));
 366           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 367           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 368                              DR_REF (drb));
 369           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 370         }
 371
 372       /* Add to list of ddrs that need to be tested at run-time.  */
 373       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 374     }
 375
 376   /* Known data dependence.  */
 377   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 378     {
 379       /* If user asserted safelen consecutive iterations can be
 380          executed concurrently, assume independence.  */
 381       if (loop->safelen >= 2)
 382         {
 383           if ((unsigned int) loop->safelen < *max_vf)
 384             *max_vf = loop->safelen;
 385           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 386           return false;
 387         }
 388
 389       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 390           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 391         {
 392           if (dump_enabled_p ())
 393             {
 394               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 395                                "versioning for alias not supported for: "
 396                                "bad dist vector for ");
 397               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 398                                  DR_REF (dra));
 399               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 400               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 401                                  DR_REF (drb));
 402               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 403             }
 404           return true;
 405         }
 406
 407       if (dump_enabled_p ())
 408         {
 409           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 410                            "versioning for alias required: "
 411                            "bad dist vector for ");
 412           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 413           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 414           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 415           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 416         }
 417       /* Add to list of ddrs that need to be tested at run-time.  */
 418       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 419     }
 420
 421   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 422
 423   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
 424       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
 425                                                 loop_depth, max_vf))
 426     return false;
 427
 428   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 429     {
 430       int dist = dist_v[loop_depth];
 431
 432       if (dump_enabled_p ())
 433         dump_printf_loc (MSG_NOTE, vect_location,
 434                          "dependence distance  = %d.\n", dist);
 435
 436       if (dist == 0)
 437         {
 438           if (dump_enabled_p ())
 439             {
 440               dump_printf_loc (MSG_NOTE, vect_location,
 441                                "dependence distance == 0 between ");
 442               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 443               dump_printf (MSG_NOTE, " and ");
 444               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 445               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 446             }
 447
 448           /* When we perform grouped accesses and perform implicit CSE
 449              by detecting equal accesses and doing disambiguation with
 450              runtime alias tests like for
 451                 .. = a[i];
 452                 .. = a[i+1];
 453                 a[i] = ..;
 454                 a[i+1] = ..;
 455                 *p = ..;
 456                 .. = a[i];
 457                 .. = a[i+1];
 458              where we will end up loading { a[i], a[i+1] } once, make
 459              sure that inserting group loads before the first load and
 460              stores after the last store will do the right thing.
 461              Similar for groups like
 462                 a[i] = ...;
 463                 ... = a[i];
 464                 a[i+1] = ...;
 465              where loads from the group interleave with the store.  */
 466           if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
 467             {
 468               if (dump_enabled_p ())
 469                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 470                                  "READ_WRITE dependence in interleaving.\n");
 471               return true;
 472             }
 473
 474           if (loop->safelen < 2)
 475             {
 476               tree indicator = dr_zero_step_indicator (dra);
 477               if (TREE_CODE (indicator) != INTEGER_CST)
 478                 vect_check_nonzero_value (loop_vinfo, indicator);
 479               else if (integer_zerop (indicator))
 480                 {
 481                   if (dump_enabled_p ())
 482                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 483                                  "access also has a zero step\n");
 484                   return true;
 485                 }
 486             }
 487           continue;
 488         }
 489
 490       if (dist > 0 && DDR_REVERSED_P (ddr))
 491         {
 492           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 493              reversed (to make distance vector positive), and the actual
 494              distance is negative.  */
 495           if (dump_enabled_p ())
 496             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 497                              "dependence distance negative.\n");
 498           /* Record a negative dependence distance to later limit the
 499              amount of stmt copying / unrolling we can perform.
 500              Only need to handle read-after-write dependence.  */
 501           if (DR_IS_READ (drb)
 502               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 503                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 504             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 505           continue;
 506         }
 507
 508       unsigned int abs_dist = abs (dist);
 509       if (abs_dist >= 2 && abs_dist < *max_vf)
 510         {
 511           /* The dependence distance requires reduction of the maximal
 512              vectorization factor.  */
 513           *max_vf = abs (dist);
 514           if (dump_enabled_p ())
 515             dump_printf_loc (MSG_NOTE, vect_location,
 516                              "adjusting maximal vectorization factor to %i\n",
 517                              *max_vf);
 518         }
 519
 520       if (abs_dist >= *max_vf)
 521         {
 522           /* Dependence distance does not create dependence, as far as
 523              vectorization is concerned, in this case.  */
 524           if (dump_enabled_p ())
 525             dump_printf_loc (MSG_NOTE, vect_location,
 526                              "dependence distance >= VF.\n");
 527           continue;
 528         }
 529
 530       if (dump_enabled_p ())
 531         {
 532           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 533                        "not vectorized, possible dependence "
 534                        "between data-refs ");
 535           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 536           dump_printf (MSG_NOTE,  " and ");
 537           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 538           dump_printf (MSG_NOTE,  "\n");
 539         }
 540
 541       return true;
 542     }
 543
 544   return false;
 545 }
 546
 547 /* Function vect_analyze_data_ref_dependences.
 548
 549    Examine all the data references in the loop, and make sure there do not
 550    exist any data dependences between them.  Set *MAX_VF according to
 551    the maximum vectorization factor the data dependences allow.  */
 552
 553 bool
 554 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
 555                                    unsigned int *max_vf)
 556 {
 557   unsigned int i;
 558   struct data_dependence_relation *ddr;
 559
 560   if (dump_enabled_p ())
 561     dump_printf_loc (MSG_NOTE, vect_location,
 562                      "=== vect_analyze_data_ref_dependences ===\n");
 563
 564   LOOP_VINFO_DDRS (loop_vinfo)
 565     .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 566              * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 567   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 568   /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
 569   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 570                                 &LOOP_VINFO_DDRS (loop_vinfo),
 571                                 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
 572     return false;
 573
 574   /* For epilogues we either have no aliases or alias versioning
 575      was applied to original loop.  Therefore we may just get max_vf
 576      using VF of original loop.  */
 577   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 578     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 579   else
 580     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 581       if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
 582         return false;
 583
 584   return true;
 585 }
 586
 587
 588 /* Function vect_slp_analyze_data_ref_dependence.
 589
 590    Return TRUE if there (might) exist a dependence between a memory-reference
 591    DRA and a memory-reference DRB.  When versioning for alias may check a
 592    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 593    the data dependence.  */
 594
 595 static bool
 596 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
 597 {
 598   struct data_reference *dra = DDR_A (ddr);
 599   struct data_reference *drb = DDR_B (ddr);
 600
 601   /* We need to check dependences of statements marked as unvectorizable
 602      as well, they still can prohibit vectorization.  */
 603
 604   /* Independent data accesses.  */
 605   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 606     return false;
 607
 608   if (dra == drb)
 609     return false;
 610
 611   /* Read-read is OK.  */
 612   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 613     return false;
 614
 615   /* If dra and drb are part of the same interleaving chain consider
 616      them independent.  */
 617   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
 618       && (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
 619           == DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
 620     return false;
 621
 622   /* Unknown data dependence.  */
 623   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 624     {
 625       if  (dump_enabled_p ())
 626         {
 627           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 628                            "can't determine dependence between ");
 629           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 630           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 631           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 632           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 633         }
 634     }
 635   else if (dump_enabled_p ())
 636     {
 637       dump_printf_loc (MSG_NOTE, vect_location,
 638                        "determined dependence between ");
 639       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 640       dump_printf (MSG_NOTE, " and ");
 641       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 642       dump_printf (MSG_NOTE,  "\n");
 643     }
 644
 645   return true;
 646 }
 647
 648
 649 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 650    contain the vector of scalar stores of this instance if we are
 651    disambiguating the loads.  */
 652
 653 static bool
 654 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
 655                                    vec<gimple *> stores, gimple *last_store)
 656 {
 657   /* This walks over all stmts involved in the SLP load/store done
 658      in NODE verifying we can sink them up to the last stmt in the
 659      group.  */
 660   gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
 661   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 662     {
 663       gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
 664       if (access == last_access)
 665         continue;
 666       data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
 667       for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
 668            gsi_stmt (gsi) != last_access; gsi_next (&gsi))
 669         {
 670           gimple *stmt = gsi_stmt (gsi);
 671           if (! gimple_vuse (stmt)
 672               || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
 673             continue;
 674
 675           /* If we couldn't record a (single) data reference for this
 676              stmt we have to give up.  */
 677           /* ???  Here and below if dependence analysis fails we can resort
 678              to the alias oracle which can handle more kinds of stmts.  */
 679           data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
 680           if (!dr_b)
 681             return false;
 682
 683           bool dependent = false;
 684           /* If we run into a store of this same instance (we've just
 685              marked those) then delay dependence checking until we run
 686              into the last store because this is where it will have
 687              been sunk to (and we verify if we can do that as well).  */
 688           if (gimple_visited_p (stmt))
 689             {
 690               if (stmt != last_store)
 691                 continue;
 692               unsigned i;
 693               gimple *store;
 694               FOR_EACH_VEC_ELT (stores, i, store)
 695                 {
 696                   data_reference *store_dr
 697                     = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
 698                   ddr_p ddr = initialize_data_dependence_relation
 699                                 (dr_a, store_dr, vNULL);
 700                   dependent = vect_slp_analyze_data_ref_dependence (ddr);
 701                   free_dependence_relation (ddr);
 702                   if (dependent)
 703                     break;
 704                 }
 705             }
 706           else
 707             {
 708               ddr_p ddr = initialize_data_dependence_relation (dr_a,
 709                                                                dr_b, vNULL);
 710               dependent = vect_slp_analyze_data_ref_dependence (ddr);
 711               free_dependence_relation (ddr);
 712             }
 713           if (dependent)
 714             return false;
 715         }
 716     }
 717   return true;
 718 }
 719
 720
 721 /* Function vect_analyze_data_ref_dependences.
 722
 723    Examine all the data references in the basic-block, and make sure there
 724    do not exist any data dependences between them.  Set *MAX_VF according to
 725    the maximum vectorization factor the data dependences allow.  */
 726
 727 bool
 728 vect_slp_analyze_instance_dependence (slp_instance instance)
 729 {
 730   if (dump_enabled_p ())
 731     dump_printf_loc (MSG_NOTE, vect_location,
 732                      "=== vect_slp_analyze_instance_dependence ===\n");
 733
 734   /* The stores of this instance are at the root of the SLP tree.  */
 735   slp_tree store = SLP_INSTANCE_TREE (instance);
 736   if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
 737     store = NULL;
 738
 739   /* Verify we can sink stores to the vectorized stmt insert location.  */
 740   gimple *last_store = NULL;
 741   if (store)
 742     {
 743       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
 744         return false;
 745
 746       /* Mark stores in this instance and remember the last one.  */
 747       last_store = vect_find_last_scalar_stmt_in_slp (store);
 748       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 749         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
 750     }
 751
 752   bool res = true;
 753
 754   /* Verify we can sink loads to the vectorized stmt insert location,
 755      special-casing stores of this instance.  */
 756   slp_tree load;
 757   unsigned int i;
 758   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
 759     if (! vect_slp_analyze_node_dependences (instance, load,
 760                                              store
 761                                              ? SLP_TREE_SCALAR_STMTS (store)
 762                                              : vNULL, last_store))
 763       {
 764         res = false;
 765         break;
 766       }
 767
 768   /* Unset the visited flag.  */
 769   if (store)
 770     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 771       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
 772
 773   return res;
 774 }
 775
 776 /* Record in VINFO the base alignment guarantee given by DRB.  STMT is
 777    the statement that contains DRB, which is useful for recording in the
 778    dump file.  */
 779
 780 static void
 781 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
 782                             innermost_loop_behavior *drb)
 783 {
 784   bool existed;
 785   innermost_loop_behavior *&entry
 786     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
 787   if (!existed || entry->base_alignment < drb->base_alignment)
 788     {
 789       entry = drb;
 790       if (dump_enabled_p ())
 791         {
 792           dump_printf_loc (MSG_NOTE, vect_location,
 793                            "recording new base alignment for ");
 794           dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
 795           dump_printf (MSG_NOTE, "\n");
 796           dump_printf_loc (MSG_NOTE, vect_location,
 797                            "  alignment:    %d\n", drb->base_alignment);
 798           dump_printf_loc (MSG_NOTE, vect_location,
 799                            "  misalignment: %d\n", drb->base_misalignment);
 800           dump_printf_loc (MSG_NOTE, vect_location,
 801                            "  based on:     ");
 802           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 803         }
 804     }
 805 }
 806
 807 /* If the region we're going to vectorize is reached, all unconditional
 808    data references occur at least once.  We can therefore pool the base
 809    alignment guarantees from each unconditional reference.  Do this by
 810    going through all the data references in VINFO and checking whether
 811    the containing statement makes the reference unconditionally.  If so,
 812    record the alignment of the base address in VINFO so that it can be
 813    used for all other references with the same base.  */
 814
 815 void
 816 vect_record_base_alignments (vec_info *vinfo)
 817 {
 818   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
 819   struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
 820   data_reference *dr;
 821   unsigned int i;
 822   FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
 823     if (!DR_IS_CONDITIONAL_IN_STMT (dr))
 824       {
 825         gimple *stmt = DR_STMT (dr);
 826         vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
 827
 828         /* If DR is nested in the loop that is being vectorized, we can also
 829            record the alignment of the base wrt the outer loop.  */
 830         if (loop && nested_in_vect_loop_p (loop, stmt))
 831           {
 832             stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 833             vect_record_base_alignment
 834               (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
 835           }
 836       }
 837 }
 838
 839 /* Return the target alignment for the vectorized form of DR.  */
 840
 841 static unsigned int
 842 vect_calculate_target_alignment (struct data_reference *dr)
 843 {
 844   gimple *stmt = DR_STMT (dr);
 845   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 846   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 847   return targetm.vectorize.preferred_vector_alignment (vectype);
 848 }
 849
 850 /* Function vect_compute_data_ref_alignment
 851
 852    Compute the misalignment of the data reference DR.
 853
 854    Output:
 855    1. If during the misalignment computation it is found that the data reference
 856       cannot be vectorized then false is returned.
 857    2. DR_MISALIGNMENT (DR) is defined.
 858
 859    FOR NOW: No analysis is actually performed. Misalignment is calculated
 860    only for trivial cases. TODO.  */
 861
 862 bool
 863 vect_compute_data_ref_alignment (struct data_reference *dr)
 864 {
 865   gimple *stmt = DR_STMT (dr);
 866   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 867   vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
 868   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 869   struct loop *loop = NULL;
 870   tree ref = DR_REF (dr);
 871   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 872
 873   if (dump_enabled_p ())
 874     dump_printf_loc (MSG_NOTE, vect_location,
 875                      "vect_compute_data_ref_alignment:\n");
 876
 877   if (loop_vinfo)
 878     loop = LOOP_VINFO_LOOP (loop_vinfo);
 879
 880   /* Initialize misalignment to unknown.  */
 881   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
 882
 883   innermost_loop_behavior *drb = vect_dr_behavior (dr);
 884   bool step_preserves_misalignment_p;
 885
 886   unsigned HOST_WIDE_INT vector_alignment
 887     = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
 888   DR_TARGET_ALIGNMENT (dr) = vector_alignment;
 889
 890   /* No step for BB vectorization.  */
 891   if (!loop)
 892     {
 893       gcc_assert (integer_zerop (drb->step));
 894       step_preserves_misalignment_p = true;
 895     }
 896
 897   /* In case the dataref is in an inner-loop of the loop that is being
 898      vectorized (LOOP), we use the base and misalignment information
 899      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 900      stays the same throughout the execution of the inner-loop, which is why
 901      we have to check that the stride of the dataref in the inner-loop evenly
 902      divides by the vector alignment.  */
 903   else if (nested_in_vect_loop_p (loop, stmt))
 904     {
 905       step_preserves_misalignment_p
 906         = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
 907
 908       if (dump_enabled_p ())
 909         {
 910           if (step_preserves_misalignment_p)
 911             dump_printf_loc (MSG_NOTE, vect_location,
 912                              "inner step divides the vector alignment.\n");
 913           else
 914             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 915                              "inner step doesn't divide the vector"
 916                              " alignment.\n");
 917         }
 918     }
 919
 920   /* Similarly we can only use base and misalignment information relative to
 921      an innermost loop if the misalignment stays the same throughout the
 922      execution of the loop.  As above, this is the case if the stride of
 923      the dataref evenly divides by the alignment.  */
 924   else
 925     {
 926       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 927       step_preserves_misalignment_p
 928         = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
 929
 930       if (!step_preserves_misalignment_p && dump_enabled_p ())
 931         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 932                          "step doesn't divide the vector alignment.\n");
 933     }
 934
 935   unsigned int base_alignment = drb->base_alignment;
 936   unsigned int base_misalignment = drb->base_misalignment;
 937
 938   /* Calculate the maximum of the pooled base address alignment and the
 939      alignment that we can compute for DR itself.  */
 940   innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
 941   if (entry && base_alignment < (*entry)->base_alignment)
 942     {
 943       base_alignment = (*entry)->base_alignment;
 944       base_misalignment = (*entry)->base_misalignment;
 945     }
 946
 947   if (drb->offset_alignment < vector_alignment
 948       || !step_preserves_misalignment_p
 949       /* We need to know whether the step wrt the vectorized loop is
 950          negative when computing the starting misalignment below.  */
 951       || TREE_CODE (drb->step) != INTEGER_CST)
 952     {
 953       if (dump_enabled_p ())
 954         {
 955           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 956                            "Unknown alignment for access: ");
 957           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 958           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 959         }
 960       return true;
 961     }
 962
 963   if (base_alignment < vector_alignment)
 964     {
 965       unsigned int max_alignment;
 966       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
 967       if (max_alignment < vector_alignment
 968           || !vect_can_force_dr_alignment_p (base,
 969                                              vector_alignment * BITS_PER_UNIT))
 970         {
 971           if (dump_enabled_p ())
 972             {
 973               dump_printf_loc (MSG_NOTE, vect_location,
 974                                "can't force alignment of ref: ");
 975               dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 976               dump_printf (MSG_NOTE, "\n");
 977             }
 978           return true;
 979         }
 980
 981       /* Force the alignment of the decl.
 982          NOTE: This is the only change to the code we make during
 983          the analysis phase, before deciding to vectorize the loop.  */
 984       if (dump_enabled_p ())
 985         {
 986           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
 987           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 988           dump_printf (MSG_NOTE, "\n");
 989         }
 990
 991       DR_VECT_AUX (dr)->base_decl = base;
 992       DR_VECT_AUX (dr)->base_misaligned = true;
 993       base_misalignment = 0;
 994     }
 995   poly_int64 misalignment
 996     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
 997
 998   /* If this is a backward running DR then first access in the larger
 999      vectype actually is N-1 elements before the address in the DR.
1000      Adjust misalign accordingly.  */
1001   if (tree_int_cst_sgn (drb->step) < 0)
1002     /* PLUS because STEP is negative.  */
1003     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1004                      * TREE_INT_CST_LOW (drb->step));
1005
1006   unsigned int const_misalignment;
1007   if (!known_misalignment (misalignment, vector_alignment,
1008                            &const_misalignment))
1009     {
1010       if (dump_enabled_p ())
1011         {
1012           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1013                            "Non-constant misalignment for access: ");
1014           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1015           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1016         }
1017       return true;
1018     }
1019
1020   SET_DR_MISALIGNMENT (dr, const_misalignment);
1021
1022   if (dump_enabled_p ())
1023     {
1024       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1025                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1026       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1027       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1028     }
1029
1030   return true;
1031 }
1032
1033 /* Function vect_update_misalignment_for_peel.
1034    Sets DR's misalignment
1035    - to 0 if it has the same alignment as DR_PEEL,
1036    - to the misalignment computed using NPEEL if DR's salignment is known,
1037    - to -1 (unknown) otherwise.
1038
1039    DR - the data reference whose misalignment is to be adjusted.
1040    DR_PEEL - the data reference whose misalignment is being made
1041              zero in the vector loop by the peel.
1042    NPEEL - the number of iterations in the peel loop if the misalignment
1043            of DR_PEEL is known at compile time.  */
1044
1045 static void
1046 vect_update_misalignment_for_peel (struct data_reference *dr,
1047                                    struct data_reference *dr_peel, int npeel)
1048 {
1049   unsigned int i;
1050   vec<dr_p> same_aligned_drs;
1051   struct data_reference *current_dr;
1052   int dr_size = vect_get_scalar_dr_size (dr);
1053   int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1054   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1055   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1056
1057  /* For interleaved data accesses the step in the loop must be multiplied by
1058      the size of the interleaving group.  */
1059   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1060     dr_size *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
1061   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1062     dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1063
1064   /* It can be assumed that the data refs with the same alignment as dr_peel
1065      are aligned in the vector loop.  */
1066   same_aligned_drs
1067     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1068   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1069     {
1070       if (current_dr != dr)
1071         continue;
1072       gcc_assert (!known_alignment_for_access_p (dr)
1073                   || !known_alignment_for_access_p (dr_peel)
1074                   || (DR_MISALIGNMENT (dr) / dr_size
1075                       == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1076       SET_DR_MISALIGNMENT (dr, 0);
1077       return;
1078     }
1079
1080   if (known_alignment_for_access_p (dr)
1081       && known_alignment_for_access_p (dr_peel))
1082     {
1083       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1084       int misal = DR_MISALIGNMENT (dr);
1085       misal += negative ? -npeel * dr_size : npeel * dr_size;
1086       misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1087       SET_DR_MISALIGNMENT (dr, misal);
1088       return;
1089     }
1090
1091   if (dump_enabled_p ())
1092     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1093                      "to unknown (-1).\n");
1094   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1095 }
1096
1097
1098 /* Function verify_data_ref_alignment
1099
1100    Return TRUE if DR can be handled with respect to alignment.  */
1101
1102 static bool
1103 verify_data_ref_alignment (data_reference_p dr)
1104 {
1105   enum dr_alignment_support supportable_dr_alignment
1106     = vect_supportable_dr_alignment (dr, false);
1107   if (!supportable_dr_alignment)
1108     {
1109       if (dump_enabled_p ())
1110         {
1111           if (DR_IS_READ (dr))
1112             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1113                              "not vectorized: unsupported unaligned load.");
1114           else
1115             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1116                              "not vectorized: unsupported unaligned "
1117                              "store.");
1118
1119           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1120                              DR_REF (dr));
1121           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1122         }
1123       return false;
1124     }
1125
1126   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1127     dump_printf_loc (MSG_NOTE, vect_location,
1128                      "Vectorizing an unaligned access.\n");
1129
1130   return true;
1131 }
1132
1133 /* Function vect_verify_datarefs_alignment
1134
1135    Return TRUE if all data references in the loop can be
1136    handled with respect to alignment.  */
1137
1138 bool
1139 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1140 {
1141   vec<data_reference_p> datarefs = vinfo->datarefs;
1142   struct data_reference *dr;
1143   unsigned int i;
1144
1145   FOR_EACH_VEC_ELT (datarefs, i, dr)
1146     {
1147       gimple *stmt = DR_STMT (dr);
1148       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1149
1150       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1151         continue;
1152
1153       /* For interleaving, only the alignment of the first access matters.   */
1154       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1155           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1156         continue;
1157
1158       /* Strided accesses perform only component accesses, alignment is
1159          irrelevant for them.  */
1160       if (STMT_VINFO_STRIDED_P (stmt_info)
1161           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1162         continue;
1163
1164       if (! verify_data_ref_alignment (dr))
1165         return false;
1166     }
1167
1168   return true;
1169 }
1170
1171 /* Given an memory reference EXP return whether its alignment is less
1172    than its size.  */
1173
1174 static bool
1175 not_size_aligned (tree exp)
1176 {
1177   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1178     return true;
1179
1180   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1181           > get_object_alignment (exp));
1182 }
1183
1184 /* Function vector_alignment_reachable_p
1185
1186    Return true if vector alignment for DR is reachable by peeling
1187    a few loop iterations.  Return false otherwise.  */
1188
1189 static bool
1190 vector_alignment_reachable_p (struct data_reference *dr)
1191 {
1192   gimple *stmt = DR_STMT (dr);
1193   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1194   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1195
1196   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1197     {
1198       /* For interleaved access we peel only if number of iterations in
1199          the prolog loop ({VF - misalignment}), is a multiple of the
1200          number of the interleaved accesses.  */
1201       int elem_size, mis_in_elements;
1202
1203       /* FORNOW: handle only known alignment.  */
1204       if (!known_alignment_for_access_p (dr))
1205         return false;
1206
1207       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1208       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1209       elem_size = vector_element_size (vector_size, nelements);
1210       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1211
1212       if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1213         return false;
1214     }
1215
1216   /* If misalignment is known at the compile time then allow peeling
1217      only if natural alignment is reachable through peeling.  */
1218   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1219     {
1220       HOST_WIDE_INT elmsize =
1221                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1222       if (dump_enabled_p ())
1223         {
1224           dump_printf_loc (MSG_NOTE, vect_location,
1225                            "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1226           dump_printf (MSG_NOTE,
1227                        ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1228         }
1229       if (DR_MISALIGNMENT (dr) % elmsize)
1230         {
1231           if (dump_enabled_p ())
1232             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1233                              "data size does not divide the misalignment.\n");
1234           return false;
1235         }
1236     }
1237
1238   if (!known_alignment_for_access_p (dr))
1239     {
1240       tree type = TREE_TYPE (DR_REF (dr));
1241       bool is_packed = not_size_aligned (DR_REF (dr));
1242       if (dump_enabled_p ())
1243         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1244                          "Unknown misalignment, %snaturally aligned\n",
1245                          is_packed ? "not " : "");
1246       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1247     }
1248
1249   return true;
1250 }
1251
1252
1253 /* Calculate the cost of the memory access represented by DR.  */
1254
1255 static void
1256 vect_get_data_access_cost (struct data_reference *dr,
1257                            unsigned int *inside_cost,
1258                            unsigned int *outside_cost,
1259                            stmt_vector_for_cost *body_cost_vec,
1260                            stmt_vector_for_cost *prologue_cost_vec)
1261 {
1262   gimple *stmt = DR_STMT (dr);
1263   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1264   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1265   int ncopies;
1266
1267   if (PURE_SLP_STMT (stmt_info))
1268     ncopies = 1;
1269   else
1270     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1271
1272   if (DR_IS_READ (dr))
1273     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1274                         prologue_cost_vec, body_cost_vec, false);
1275   else
1276     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1277
1278   if (dump_enabled_p ())
1279     dump_printf_loc (MSG_NOTE, vect_location,
1280                      "vect_get_data_access_cost: inside_cost = %d, "
1281                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1282 }
1283
1284
1285 typedef struct _vect_peel_info
1286 {
1287   struct data_reference *dr;
1288   int npeel;
1289   unsigned int count;
1290 } *vect_peel_info;
1291
1292 typedef struct _vect_peel_extended_info
1293 {
1294   struct _vect_peel_info peel_info;
1295   unsigned int inside_cost;
1296   unsigned int outside_cost;
1297 } *vect_peel_extended_info;
1298
1299
1300 /* Peeling hashtable helpers.  */
1301
1302 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1303 {
1304   static inline hashval_t hash (const _vect_peel_info *);
1305   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1306 };
1307
1308 inline hashval_t
1309 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1310 {
1311   return (hashval_t) peel_info->npeel;
1312 }
1313
1314 inline bool
1315 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1316 {
1317   return (a->npeel == b->npeel);
1318 }
1319
1320
1321 /* Insert DR into peeling hash table with NPEEL as key.  */
1322
1323 static void
1324 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1325                           loop_vec_info loop_vinfo, struct data_reference *dr,
1326                           int npeel)
1327 {
1328   struct _vect_peel_info elem, *slot;
1329   _vect_peel_info **new_slot;
1330   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1331
1332   elem.npeel = npeel;
1333   slot = peeling_htab->find (&elem);
1334   if (slot)
1335     slot->count++;
1336   else
1337     {
1338       slot = XNEW (struct _vect_peel_info);
1339       slot->npeel = npeel;
1340       slot->dr = dr;
1341       slot->count = 1;
1342       new_slot = peeling_htab->find_slot (slot, INSERT);
1343       *new_slot = slot;
1344     }
1345
1346   if (!supportable_dr_alignment
1347       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1348     slot->count += VECT_MAX_COST;
1349 }
1350
1351
1352 /* Traverse peeling hash table to find peeling option that aligns maximum
1353    number of data accesses.  */
1354
1355 int
1356 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1357                                      _vect_peel_extended_info *max)
1358 {
1359   vect_peel_info elem = *slot;
1360
1361   if (elem->count > max->peel_info.count
1362       || (elem->count == max->peel_info.count
1363           && max->peel_info.npeel > elem->npeel))
1364     {
1365       max->peel_info.npeel = elem->npeel;
1366       max->peel_info.count = elem->count;
1367       max->peel_info.dr = elem->dr;
1368     }
1369
1370   return 1;
1371 }
1372
1373 /* Get the costs of peeling NPEEL iterations checking data access costs
1374    for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1375    misalignment will be zero after peeling.  */
1376
1377 static void
1378 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1379                                 struct data_reference *dr0,
1380                                 unsigned int *inside_cost,
1381                                 unsigned int *outside_cost,
1382                                 stmt_vector_for_cost *body_cost_vec,
1383                                 stmt_vector_for_cost *prologue_cost_vec,
1384                                 unsigned int npeel,
1385                                 bool unknown_misalignment)
1386 {
1387   unsigned i;
1388   data_reference *dr;
1389
1390   FOR_EACH_VEC_ELT (datarefs, i, dr)
1391     {
1392       gimple *stmt = DR_STMT (dr);
1393       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1394       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1395         continue;
1396
1397       /* For interleaving, only the alignment of the first access
1398          matters.  */
1399       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1400           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1401         continue;
1402
1403       /* Strided accesses perform only component accesses, alignment is
1404          irrelevant for them.  */
1405       if (STMT_VINFO_STRIDED_P (stmt_info)
1406           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1407         continue;
1408
1409       int save_misalignment;
1410       save_misalignment = DR_MISALIGNMENT (dr);
1411       if (npeel == 0)
1412         ;
1413       else if (unknown_misalignment && dr == dr0)
1414         SET_DR_MISALIGNMENT (dr, 0);
1415       else
1416         vect_update_misalignment_for_peel (dr, dr0, npeel);
1417       vect_get_data_access_cost (dr, inside_cost, outside_cost,
1418                                  body_cost_vec, prologue_cost_vec);
1419       SET_DR_MISALIGNMENT (dr, save_misalignment);
1420     }
1421 }
1422
1423 /* Traverse peeling hash table and calculate cost for each peeling option.
1424    Find the one with the lowest cost.  */
1425
1426 int
1427 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1428                                    _vect_peel_extended_info *min)
1429 {
1430   vect_peel_info elem = *slot;
1431   int dummy;
1432   unsigned int inside_cost = 0, outside_cost = 0;
1433   gimple *stmt = DR_STMT (elem->dr);
1434   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1435   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1436   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1437                        epilogue_cost_vec;
1438
1439   prologue_cost_vec.create (2);
1440   body_cost_vec.create (2);
1441   epilogue_cost_vec.create (2);
1442
1443   vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1444                                   elem->dr, &inside_cost, &outside_cost,
1445                                   &body_cost_vec, &prologue_cost_vec,
1446                                   elem->npeel, false);
1447
1448   body_cost_vec.release ();
1449
1450   outside_cost += vect_get_known_peeling_cost
1451     (loop_vinfo, elem->npeel, &dummy,
1452      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1453      &prologue_cost_vec, &epilogue_cost_vec);
1454
1455   /* Prologue and epilogue costs are added to the target model later.
1456      These costs depend only on the scalar iteration cost, the
1457      number of peeling iterations finally chosen, and the number of
1458      misaligned statements.  So discard the information found here.  */
1459   prologue_cost_vec.release ();
1460   epilogue_cost_vec.release ();
1461
1462   if (inside_cost < min->inside_cost
1463       || (inside_cost == min->inside_cost
1464           && outside_cost < min->outside_cost))
1465     {
1466       min->inside_cost = inside_cost;
1467       min->outside_cost = outside_cost;
1468       min->peel_info.dr = elem->dr;
1469       min->peel_info.npeel = elem->npeel;
1470       min->peel_info.count = elem->count;
1471     }
1472
1473   return 1;
1474 }
1475
1476
1477 /* Choose best peeling option by traversing peeling hash table and either
1478    choosing an option with the lowest cost (if cost model is enabled) or the
1479    option that aligns as many accesses as possible.  */
1480
1481 static struct _vect_peel_extended_info
1482 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1483                                        loop_vec_info loop_vinfo)
1484 {
1485    struct _vect_peel_extended_info res;
1486
1487    res.peel_info.dr = NULL;
1488
1489    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1490      {
1491        res.inside_cost = INT_MAX;
1492        res.outside_cost = INT_MAX;
1493        peeling_htab->traverse <_vect_peel_extended_info *,
1494                                vect_peeling_hash_get_lowest_cost> (&res);
1495      }
1496    else
1497      {
1498        res.peel_info.count = 0;
1499        peeling_htab->traverse <_vect_peel_extended_info *,
1500                                vect_peeling_hash_get_most_frequent> (&res);
1501        res.inside_cost = 0;
1502        res.outside_cost = 0;
1503      }
1504
1505    return res;
1506 }
1507
1508 /* Return true if the new peeling NPEEL is supported.  */
1509
1510 static bool
1511 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1512                           unsigned npeel)
1513 {
1514   unsigned i;
1515   struct data_reference *dr = NULL;
1516   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1517   gimple *stmt;
1518   stmt_vec_info stmt_info;
1519   enum dr_alignment_support supportable_dr_alignment;
1520
1521   /* Ensure that all data refs can be vectorized after the peel.  */
1522   FOR_EACH_VEC_ELT (datarefs, i, dr)
1523     {
1524       int save_misalignment;
1525
1526       if (dr == dr0)
1527         continue;
1528
1529       stmt = DR_STMT (dr);
1530       stmt_info = vinfo_for_stmt (stmt);
1531       /* For interleaving, only the alignment of the first access
1532          matters.  */
1533       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1534           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1535         continue;
1536
1537       /* Strided accesses perform only component accesses, alignment is
1538          irrelevant for them.  */
1539       if (STMT_VINFO_STRIDED_P (stmt_info)
1540           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1541         continue;
1542
1543       save_misalignment = DR_MISALIGNMENT (dr);
1544       vect_update_misalignment_for_peel (dr, dr0, npeel);
1545       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1546       SET_DR_MISALIGNMENT (dr, save_misalignment);
1547
1548       if (!supportable_dr_alignment)
1549         return false;
1550     }
1551
1552   return true;
1553 }
1554
1555 /* Function vect_enhance_data_refs_alignment
1556
1557    This pass will use loop versioning and loop peeling in order to enhance
1558    the alignment of data references in the loop.
1559
1560    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1561    original loop is to be vectorized.  Any other loops that are created by
1562    the transformations performed in this pass - are not supposed to be
1563    vectorized.  This restriction will be relaxed.
1564
1565    This pass will require a cost model to guide it whether to apply peeling
1566    or versioning or a combination of the two.  For example, the scheme that
1567    intel uses when given a loop with several memory accesses, is as follows:
1568    choose one memory access ('p') which alignment you want to force by doing
1569    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1570    other accesses are not necessarily aligned, or (2) use loop versioning to
1571    generate one loop in which all accesses are aligned, and another loop in
1572    which only 'p' is necessarily aligned.
1573
1574    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1575    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1576    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1577
1578    Devising a cost model is the most critical aspect of this work.  It will
1579    guide us on which access to peel for, whether to use loop versioning, how
1580    many versions to create, etc.  The cost model will probably consist of
1581    generic considerations as well as target specific considerations (on
1582    powerpc for example, misaligned stores are more painful than misaligned
1583    loads).
1584
1585    Here are the general steps involved in alignment enhancements:
1586
1587      -- original loop, before alignment analysis:
1588         for (i=0; i<N; i++){
1589           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1590           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1591         }
1592
1593      -- After vect_compute_data_refs_alignment:
1594         for (i=0; i<N; i++){
1595           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1596           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1597         }
1598
1599      -- Possibility 1: we do loop versioning:
1600      if (p is aligned) {
1601         for (i=0; i<N; i++){    # loop 1A
1602           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1603           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1604         }
1605      }
1606      else {
1607         for (i=0; i<N; i++){    # loop 1B
1608           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1609           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1610         }
1611      }
1612
1613      -- Possibility 2: we do loop peeling:
1614      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1615         x = q[i];
1616         p[i] = y;
1617      }
1618      for (i = 3; i < N; i++){   # loop 2A
1619         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1620         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1621      }
1622
1623      -- Possibility 3: combination of loop peeling and versioning:
1624      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1625         x = q[i];
1626         p[i] = y;
1627      }
1628      if (p is aligned) {
1629         for (i = 3; i<N; i++){  # loop 3A
1630           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1631           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1632         }
1633      }
1634      else {
1635         for (i = 3; i<N; i++){  # loop 3B
1636           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1637           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1638         }
1639      }
1640
1641      These loops are later passed to loop_transform to be vectorized.  The
1642      vectorizer will use the alignment information to guide the transformation
1643      (whether to generate regular loads/stores, or with special handling for
1644      misalignment).  */
1645
1646 bool
1647 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1648 {
1649   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1650   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1651   enum dr_alignment_support supportable_dr_alignment;
1652   struct data_reference *dr0 = NULL, *first_store = NULL;
1653   struct data_reference *dr;
1654   unsigned int i, j;
1655   bool do_peeling = false;
1656   bool do_versioning = false;
1657   bool stat;
1658   gimple *stmt;
1659   stmt_vec_info stmt_info;
1660   unsigned int npeel = 0;
1661   bool one_misalignment_known = false;
1662   bool one_misalignment_unknown = false;
1663   bool one_dr_unsupportable = false;
1664   struct data_reference *unsupportable_dr = NULL;
1665   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1666   unsigned possible_npeel_number = 1;
1667   tree vectype;
1668   unsigned int mis, same_align_drs_max = 0;
1669   hash_table<peel_info_hasher> peeling_htab (1);
1670
1671   if (dump_enabled_p ())
1672     dump_printf_loc (MSG_NOTE, vect_location,
1673                      "=== vect_enhance_data_refs_alignment ===\n");
1674
1675   /* Reset data so we can safely be called multiple times.  */
1676   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1677   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1678
1679   /* While cost model enhancements are expected in the future, the high level
1680      view of the code at this time is as follows:
1681
1682      A) If there is a misaligned access then see if peeling to align
1683         this access can make all data references satisfy
1684         vect_supportable_dr_alignment.  If so, update data structures
1685         as needed and return true.
1686
1687      B) If peeling wasn't possible and there is a data reference with an
1688         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1689         then see if loop versioning checks can be used to make all data
1690         references satisfy vect_supportable_dr_alignment.  If so, update
1691         data structures as needed and return true.
1692
1693      C) If neither peeling nor versioning were successful then return false if
1694         any data reference does not satisfy vect_supportable_dr_alignment.
1695
1696      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1697
1698      Note, Possibility 3 above (which is peeling and versioning together) is not
1699      being done at this time.  */
1700
1701   /* (1) Peeling to force alignment.  */
1702
1703   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1704      Considerations:
1705      + How many accesses will become aligned due to the peeling
1706      - How many accesses will become unaligned due to the peeling,
1707        and the cost of misaligned accesses.
1708      - The cost of peeling (the extra runtime checks, the increase
1709        in code size).  */
1710
1711   FOR_EACH_VEC_ELT (datarefs, i, dr)
1712     {
1713       stmt = DR_STMT (dr);
1714       stmt_info = vinfo_for_stmt (stmt);
1715
1716       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1717         continue;
1718
1719       /* For interleaving, only the alignment of the first access
1720          matters.  */
1721       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1722           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1723         continue;
1724
1725       /* For invariant accesses there is nothing to enhance.  */
1726       if (integer_zerop (DR_STEP (dr)))
1727         continue;
1728
1729       /* Strided accesses perform only component accesses, alignment is
1730          irrelevant for them.  */
1731       if (STMT_VINFO_STRIDED_P (stmt_info)
1732           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1733         continue;
1734
1735       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1736       do_peeling = vector_alignment_reachable_p (dr);
1737       if (do_peeling)
1738         {
1739           if (known_alignment_for_access_p (dr))
1740             {
1741               unsigned int npeel_tmp = 0;
1742               bool negative = tree_int_cst_compare (DR_STEP (dr),
1743                                                     size_zero_node) < 0;
1744
1745               vectype = STMT_VINFO_VECTYPE (stmt_info);
1746               unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1747               unsigned int dr_size = vect_get_scalar_dr_size (dr);
1748               mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1749               if (DR_MISALIGNMENT (dr) != 0)
1750                 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1751
1752               /* For multiple types, it is possible that the bigger type access
1753                  will have more than one peeling option.  E.g., a loop with two
1754                  types: one of size (vector size / 4), and the other one of
1755                  size (vector size / 8).  Vectorization factor will 8.  If both
1756                  accesses are misaligned by 3, the first one needs one scalar
1757                  iteration to be aligned, and the second one needs 5.  But the
1758                  first one will be aligned also by peeling 5 scalar
1759                  iterations, and in that case both accesses will be aligned.
1760                  Hence, except for the immediate peeling amount, we also want
1761                  to try to add full vector size, while we don't exceed
1762                  vectorization factor.
1763                  We do this automatically for cost model, since we calculate
1764                  cost for every peeling option.  */
1765               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1766                 {
1767                   poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1768                                           ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1769                   possible_npeel_number
1770                     = vect_get_num_vectors (nscalars, vectype);
1771
1772                   /* NPEEL_TMP is 0 when there is no misalignment, but also
1773                      allow peeling NELEMENTS.  */
1774                   if (DR_MISALIGNMENT (dr) == 0)
1775                     possible_npeel_number++;
1776                 }
1777
1778               /* Save info about DR in the hash table.  Also include peeling
1779                  amounts according to the explanation above.  */
1780               for (j = 0; j < possible_npeel_number; j++)
1781                 {
1782                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1783                                             dr, npeel_tmp);
1784                   npeel_tmp += target_align / dr_size;
1785                 }
1786
1787               one_misalignment_known = true;
1788             }
1789           else
1790             {
1791               /* If we don't know any misalignment values, we prefer
1792                  peeling for data-ref that has the maximum number of data-refs
1793                  with the same alignment, unless the target prefers to align
1794                  stores over load.  */
1795               unsigned same_align_drs
1796                 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1797               if (!dr0
1798                   || same_align_drs_max < same_align_drs)
1799                 {
1800                   same_align_drs_max = same_align_drs;
1801                   dr0 = dr;
1802                 }
1803               /* For data-refs with the same number of related
1804                  accesses prefer the one where the misalign
1805                  computation will be invariant in the outermost loop.  */
1806               else if (same_align_drs_max == same_align_drs)
1807                 {
1808                   struct loop *ivloop0, *ivloop;
1809                   ivloop0 = outermost_invariant_loop_for_expr
1810                     (loop, DR_BASE_ADDRESS (dr0));
1811                   ivloop = outermost_invariant_loop_for_expr
1812                     (loop, DR_BASE_ADDRESS (dr));
1813                   if ((ivloop && !ivloop0)
1814                       || (ivloop && ivloop0
1815                           && flow_loop_nested_p (ivloop, ivloop0)))
1816                     dr0 = dr;
1817                 }
1818
1819               one_misalignment_unknown = true;
1820
1821               /* Check for data refs with unsupportable alignment that
1822                  can be peeled.  */
1823               if (!supportable_dr_alignment)
1824               {
1825                 one_dr_unsupportable = true;
1826                 unsupportable_dr = dr;
1827               }
1828
1829               if (!first_store && DR_IS_WRITE (dr))
1830                 first_store = dr;
1831             }
1832         }
1833       else
1834         {
1835           if (!aligned_access_p (dr))
1836             {
1837               if (dump_enabled_p ())
1838                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1839                                  "vector alignment may not be reachable\n");
1840               break;
1841             }
1842         }
1843     }
1844
1845   /* Check if we can possibly peel the loop.  */
1846   if (!vect_can_advance_ivs_p (loop_vinfo)
1847       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1848       || loop->inner)
1849     do_peeling = false;
1850
1851   struct _vect_peel_extended_info peel_for_known_alignment;
1852   struct _vect_peel_extended_info peel_for_unknown_alignment;
1853   struct _vect_peel_extended_info best_peel;
1854
1855   peel_for_unknown_alignment.inside_cost = INT_MAX;
1856   peel_for_unknown_alignment.outside_cost = INT_MAX;
1857   peel_for_unknown_alignment.peel_info.count = 0;
1858
1859   if (do_peeling
1860       && one_misalignment_unknown)
1861     {
1862       /* Check if the target requires to prefer stores over loads, i.e., if
1863          misaligned stores are more expensive than misaligned loads (taking
1864          drs with same alignment into account).  */
1865       unsigned int load_inside_cost = 0;
1866       unsigned int load_outside_cost = 0;
1867       unsigned int store_inside_cost = 0;
1868       unsigned int store_outside_cost = 0;
1869       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1870
1871       stmt_vector_for_cost dummy;
1872       dummy.create (2);
1873       vect_get_peeling_costs_all_drs (datarefs, dr0,
1874                                       &load_inside_cost,
1875                                       &load_outside_cost,
1876                                       &dummy, &dummy, estimated_npeels, true);
1877       dummy.release ();
1878
1879       if (first_store)
1880         {
1881           dummy.create (2);
1882           vect_get_peeling_costs_all_drs (datarefs, first_store,
1883                                           &store_inside_cost,
1884                                           &store_outside_cost,
1885                                           &dummy, &dummy,
1886                                           estimated_npeels, true);
1887           dummy.release ();
1888         }
1889       else
1890         {
1891           store_inside_cost = INT_MAX;
1892           store_outside_cost = INT_MAX;
1893         }
1894
1895       if (load_inside_cost > store_inside_cost
1896           || (load_inside_cost == store_inside_cost
1897               && load_outside_cost > store_outside_cost))
1898         {
1899           dr0 = first_store;
1900           peel_for_unknown_alignment.inside_cost = store_inside_cost;
1901           peel_for_unknown_alignment.outside_cost = store_outside_cost;
1902         }
1903       else
1904         {
1905           peel_for_unknown_alignment.inside_cost = load_inside_cost;
1906           peel_for_unknown_alignment.outside_cost = load_outside_cost;
1907         }
1908
1909       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1910       prologue_cost_vec.create (2);
1911       epilogue_cost_vec.create (2);
1912
1913       int dummy2;
1914       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1915         (loop_vinfo, estimated_npeels, &dummy2,
1916          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1917          &prologue_cost_vec, &epilogue_cost_vec);
1918
1919       prologue_cost_vec.release ();
1920       epilogue_cost_vec.release ();
1921
1922       peel_for_unknown_alignment.peel_info.count = 1
1923         + STMT_VINFO_SAME_ALIGN_REFS
1924         (vinfo_for_stmt (DR_STMT (dr0))).length ();
1925     }
1926
1927   peel_for_unknown_alignment.peel_info.npeel = 0;
1928   peel_for_unknown_alignment.peel_info.dr = dr0;
1929
1930   best_peel = peel_for_unknown_alignment;
1931
1932   peel_for_known_alignment.inside_cost = INT_MAX;
1933   peel_for_known_alignment.outside_cost = INT_MAX;
1934   peel_for_known_alignment.peel_info.count = 0;
1935   peel_for_known_alignment.peel_info.dr = NULL;
1936
1937   if (do_peeling && one_misalignment_known)
1938     {
1939       /* Peeling is possible, but there is no data access that is not supported
1940          unless aligned.  So we try to choose the best possible peeling from
1941          the hash table.  */
1942       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1943         (&peeling_htab, loop_vinfo);
1944     }
1945
1946   /* Compare costs of peeling for known and unknown alignment. */
1947   if (peel_for_known_alignment.peel_info.dr != NULL
1948       && peel_for_unknown_alignment.inside_cost
1949       >= peel_for_known_alignment.inside_cost)
1950     {
1951       best_peel = peel_for_known_alignment;
1952
1953       /* If the best peeling for known alignment has NPEEL == 0, perform no
1954          peeling at all except if there is an unsupportable dr that we can
1955          align.  */
1956       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1957         do_peeling = false;
1958     }
1959
1960   /* If there is an unsupportable data ref, prefer this over all choices so far
1961      since we'd have to discard a chosen peeling except when it accidentally
1962      aligned the unsupportable data ref.  */
1963   if (one_dr_unsupportable)
1964     dr0 = unsupportable_dr;
1965   else if (do_peeling)
1966     {
1967       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1968          TODO: Use nopeel_outside_cost or get rid of it?  */
1969       unsigned nopeel_inside_cost = 0;
1970       unsigned nopeel_outside_cost = 0;
1971
1972       stmt_vector_for_cost dummy;
1973       dummy.create (2);
1974       vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1975                                       &nopeel_outside_cost, &dummy, &dummy,
1976                                       0, false);
1977       dummy.release ();
1978
1979       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
1980          costs will be recorded.  */
1981       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1982       prologue_cost_vec.create (2);
1983       epilogue_cost_vec.create (2);
1984
1985       int dummy2;
1986       nopeel_outside_cost += vect_get_known_peeling_cost
1987         (loop_vinfo, 0, &dummy2,
1988          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1989          &prologue_cost_vec, &epilogue_cost_vec);
1990
1991       prologue_cost_vec.release ();
1992       epilogue_cost_vec.release ();
1993
1994       npeel = best_peel.peel_info.npeel;
1995       dr0 = best_peel.peel_info.dr;
1996
1997       /* If no peeling is not more expensive than the best peeling we
1998          have so far, don't perform any peeling.  */
1999       if (nopeel_inside_cost <= best_peel.inside_cost)
2000         do_peeling = false;
2001     }
2002
2003   if (do_peeling)
2004     {
2005       stmt = DR_STMT (dr0);
2006       stmt_info = vinfo_for_stmt (stmt);
2007       vectype = STMT_VINFO_VECTYPE (stmt_info);
2008
2009       if (known_alignment_for_access_p (dr0))
2010         {
2011           bool negative = tree_int_cst_compare (DR_STEP (dr0),
2012                                                 size_zero_node) < 0;
2013           if (!npeel)
2014             {
2015               /* Since it's known at compile time, compute the number of
2016                  iterations in the peeled loop (the peeling factor) for use in
2017                  updating DR_MISALIGNMENT values.  The peeling factor is the
2018                  vectorization factor minus the misalignment as an element
2019                  count.  */
2020               mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2021               unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2022               npeel = ((mis & (target_align - 1))
2023                        / vect_get_scalar_dr_size (dr0));
2024             }
2025
2026           /* For interleaved data access every iteration accesses all the
2027              members of the group, therefore we divide the number of iterations
2028              by the group size.  */
2029           stmt_info = vinfo_for_stmt (DR_STMT (dr0));
2030           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2031             npeel /= DR_GROUP_SIZE (stmt_info);
2032
2033           if (dump_enabled_p ())
2034             dump_printf_loc (MSG_NOTE, vect_location,
2035                              "Try peeling by %d\n", npeel);
2036         }
2037
2038       /* Ensure that all datarefs can be vectorized after the peel.  */
2039       if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2040         do_peeling = false;
2041
2042       /* Check if all datarefs are supportable and log.  */
2043       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2044         {
2045           stat = vect_verify_datarefs_alignment (loop_vinfo);
2046           if (!stat)
2047             do_peeling = false;
2048           else
2049             return stat;
2050         }
2051
2052       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2053       if (do_peeling)
2054         {
2055           unsigned max_allowed_peel
2056             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2057           if (max_allowed_peel != (unsigned)-1)
2058             {
2059               unsigned max_peel = npeel;
2060               if (max_peel == 0)
2061                 {
2062                   unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2063                   max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2064                 }
2065               if (max_peel > max_allowed_peel)
2066                 {
2067                   do_peeling = false;
2068                   if (dump_enabled_p ())
2069                     dump_printf_loc (MSG_NOTE, vect_location,
2070                         "Disable peeling, max peels reached: %d\n", max_peel);
2071                 }
2072             }
2073         }
2074
2075       /* Cost model #2 - if peeling may result in a remaining loop not
2076          iterating enough to be vectorized then do not peel.  Since this
2077          is a cost heuristic rather than a correctness decision, use the
2078          most likely runtime value for variable vectorization factors.  */
2079       if (do_peeling
2080           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2081         {
2082           unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2083           unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2084           if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2085               < assumed_vf + max_peel)
2086             do_peeling = false;
2087         }
2088
2089       if (do_peeling)
2090         {
2091           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2092              If the misalignment of DR_i is identical to that of dr0 then set
2093              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2094              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2095              by the peeling factor times the element size of DR_i (MOD the
2096              vectorization factor times the size).  Otherwise, the
2097              misalignment of DR_i must be set to unknown.  */
2098           FOR_EACH_VEC_ELT (datarefs, i, dr)
2099             if (dr != dr0)
2100               {
2101                 /* Strided accesses perform only component accesses, alignment
2102                    is irrelevant for them.  */
2103                 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2104                 if (STMT_VINFO_STRIDED_P (stmt_info)
2105                     && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2106                   continue;
2107
2108                 vect_update_misalignment_for_peel (dr, dr0, npeel);
2109               }
2110
2111           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2112           if (npeel)
2113             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2114           else
2115             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2116               = DR_MISALIGNMENT (dr0);
2117           SET_DR_MISALIGNMENT (dr0, 0);
2118           if (dump_enabled_p ())
2119             {
2120               dump_printf_loc (MSG_NOTE, vect_location,
2121                                "Alignment of access forced using peeling.\n");
2122               dump_printf_loc (MSG_NOTE, vect_location,
2123                                "Peeling for alignment will be applied.\n");
2124             }
2125
2126           /* The inside-loop cost will be accounted for in vectorizable_load
2127              and vectorizable_store correctly with adjusted alignments.
2128              Drop the body_cst_vec on the floor here.  */
2129           stat = vect_verify_datarefs_alignment (loop_vinfo);
2130           gcc_assert (stat);
2131           return stat;
2132         }
2133     }
2134
2135   /* (2) Versioning to force alignment.  */
2136
2137   /* Try versioning if:
2138      1) optimize loop for speed
2139      2) there is at least one unsupported misaligned data ref with an unknown
2140         misalignment, and
2141      3) all misaligned data refs with a known misalignment are supported, and
2142      4) the number of runtime alignment checks is within reason.  */
2143
2144   do_versioning =
2145         optimize_loop_nest_for_speed_p (loop)
2146         && (!loop->inner); /* FORNOW */
2147
2148   if (do_versioning)
2149     {
2150       FOR_EACH_VEC_ELT (datarefs, i, dr)
2151         {
2152           stmt = DR_STMT (dr);
2153           stmt_info = vinfo_for_stmt (stmt);
2154
2155           /* For interleaving, only the alignment of the first access
2156              matters.  */
2157           if (aligned_access_p (dr)
2158               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2159                   && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2160             continue;
2161
2162           if (STMT_VINFO_STRIDED_P (stmt_info))
2163             {
2164               /* Strided loads perform only component accesses, alignment is
2165                  irrelevant for them.  */
2166               if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2167                 continue;
2168               do_versioning = false;
2169               break;
2170             }
2171
2172           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2173
2174           if (!supportable_dr_alignment)
2175             {
2176               gimple *stmt;
2177               int mask;
2178               tree vectype;
2179
2180               if (known_alignment_for_access_p (dr)
2181                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2182                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2183                 {
2184                   do_versioning = false;
2185                   break;
2186                 }
2187
2188               stmt = DR_STMT (dr);
2189               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2190               gcc_assert (vectype);
2191
2192               /* At present we don't support versioning for alignment
2193                  with variable VF, since there's no guarantee that the
2194                  VF is a power of two.  We could relax this if we added
2195                  a way of enforcing a power-of-two size.  */
2196               unsigned HOST_WIDE_INT size;
2197               if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2198                 {
2199                   do_versioning = false;
2200                   break;
2201                 }
2202
2203               /* The rightmost bits of an aligned address must be zeros.
2204                  Construct the mask needed for this test.  For example,
2205                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2206                  mask must be 15 = 0xf. */
2207               mask = size - 1;
2208
2209               /* FORNOW: use the same mask to test all potentially unaligned
2210                  references in the loop.  The vectorizer currently supports
2211                  a single vector size, see the reference to
2212                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2213                  vectorization factor is computed.  */
2214               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2215                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2216               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2217               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2218                       DR_STMT (dr));
2219             }
2220         }
2221
2222       /* Versioning requires at least one misaligned data reference.  */
2223       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2224         do_versioning = false;
2225       else if (!do_versioning)
2226         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2227     }
2228
2229   if (do_versioning)
2230     {
2231       vec<gimple *> may_misalign_stmts
2232         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2233       gimple *stmt;
2234
2235       /* It can now be assumed that the data references in the statements
2236          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2237          of the loop being vectorized.  */
2238       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2239         {
2240           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2241           dr = STMT_VINFO_DATA_REF (stmt_info);
2242           SET_DR_MISALIGNMENT (dr, 0);
2243           if (dump_enabled_p ())
2244             dump_printf_loc (MSG_NOTE, vect_location,
2245                              "Alignment of access forced using versioning.\n");
2246         }
2247
2248       if (dump_enabled_p ())
2249         dump_printf_loc (MSG_NOTE, vect_location,
2250                          "Versioning for alignment will be applied.\n");
2251
2252       /* Peeling and versioning can't be done together at this time.  */
2253       gcc_assert (! (do_peeling && do_versioning));
2254
2255       stat = vect_verify_datarefs_alignment (loop_vinfo);
2256       gcc_assert (stat);
2257       return stat;
2258     }
2259
2260   /* This point is reached if neither peeling nor versioning is being done.  */
2261   gcc_assert (! (do_peeling || do_versioning));
2262
2263   stat = vect_verify_datarefs_alignment (loop_vinfo);
2264   return stat;
2265 }
2266
2267
2268 /* Function vect_find_same_alignment_drs.
2269
2270    Update group and alignment relations according to the chosen
2271    vectorization factor.  */
2272
2273 static void
2274 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2275 {
2276   struct data_reference *dra = DDR_A (ddr);
2277   struct data_reference *drb = DDR_B (ddr);
2278   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2279   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2280
2281   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2282     return;
2283
2284   if (dra == drb)
2285     return;
2286
2287   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2288       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2289       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2290     return;
2291
2292   /* Two references with distance zero have the same alignment.  */
2293   poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2294                           - wi::to_poly_offset (DR_INIT (drb)));
2295   if (maybe_ne (diff, 0))
2296     {
2297       /* Get the wider of the two alignments.  */
2298       unsigned int align_a = (vect_calculate_target_alignment (dra)
2299                               / BITS_PER_UNIT);
2300       unsigned int align_b = (vect_calculate_target_alignment (drb)
2301                               / BITS_PER_UNIT);
2302       unsigned int max_align = MAX (align_a, align_b);
2303
2304       /* Require the gap to be a multiple of the larger vector alignment.  */
2305       if (!multiple_p (diff, max_align))
2306         return;
2307     }
2308
2309   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2310   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2311   if (dump_enabled_p ())
2312     {
2313       dump_printf_loc (MSG_NOTE, vect_location,
2314                        "accesses have the same alignment: ");
2315       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2316       dump_printf (MSG_NOTE,  " and ");
2317       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2318       dump_printf (MSG_NOTE, "\n");
2319     }
2320 }
2321
2322
2323 /* Function vect_analyze_data_refs_alignment
2324
2325    Analyze the alignment of the data-references in the loop.
2326    Return FALSE if a data reference is found that cannot be vectorized.  */
2327
2328 bool
2329 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2330 {
2331   if (dump_enabled_p ())
2332     dump_printf_loc (MSG_NOTE, vect_location,
2333                      "=== vect_analyze_data_refs_alignment ===\n");
2334
2335   /* Mark groups of data references with same alignment using
2336      data dependence information.  */
2337   vec<ddr_p> ddrs = vinfo->ddrs;
2338   struct data_dependence_relation *ddr;
2339   unsigned int i;
2340
2341   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2342     vect_find_same_alignment_drs (ddr);
2343
2344   vec<data_reference_p> datarefs = vinfo->datarefs;
2345   struct data_reference *dr;
2346
2347   vect_record_base_alignments (vinfo);
2348   FOR_EACH_VEC_ELT (datarefs, i, dr)
2349     {
2350       stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2351       if (STMT_VINFO_VECTORIZABLE (stmt_info)
2352           && !vect_compute_data_ref_alignment (dr))
2353         {
2354           /* Strided accesses perform only component accesses, misalignment
2355              information is irrelevant for them.  */
2356           if (STMT_VINFO_STRIDED_P (stmt_info)
2357               && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2358             continue;
2359
2360           if (dump_enabled_p ())
2361             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2362                              "not vectorized: can't calculate alignment "
2363                              "for data ref.\n");
2364
2365           return false;
2366         }
2367     }
2368
2369   return true;
2370 }
2371
2372
2373 /* Analyze alignment of DRs of stmts in NODE.  */
2374
2375 static bool
2376 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2377 {
2378   /* We vectorize from the first scalar stmt in the node unless
2379      the node is permuted in which case we start from the first
2380      element in the group.  */
2381   gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2382   data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2383   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2384     first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2385
2386   data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2387   if (! vect_compute_data_ref_alignment (dr)
2388       /* For creating the data-ref pointer we need alignment of the
2389          first element anyway.  */
2390       || (dr != first_dr
2391           && ! vect_compute_data_ref_alignment (first_dr))
2392       || ! verify_data_ref_alignment (dr))
2393     {
2394       if (dump_enabled_p ())
2395         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2396                          "not vectorized: bad data alignment in basic "
2397                          "block.\n");
2398       return false;
2399     }
2400
2401   return true;
2402 }
2403
2404 /* Function vect_slp_analyze_instance_alignment
2405
2406    Analyze the alignment of the data-references in the SLP instance.
2407    Return FALSE if a data reference is found that cannot be vectorized.  */
2408
2409 bool
2410 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2411 {
2412   if (dump_enabled_p ())
2413     dump_printf_loc (MSG_NOTE, vect_location,
2414                      "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2415
2416   slp_tree node;
2417   unsigned i;
2418   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2419     if (! vect_slp_analyze_and_verify_node_alignment (node))
2420       return false;
2421
2422   node = SLP_INSTANCE_TREE (instance);
2423   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2424       && ! vect_slp_analyze_and_verify_node_alignment
2425              (SLP_INSTANCE_TREE (instance)))
2426     return false;
2427
2428   return true;
2429 }
2430
2431
2432 /* Analyze groups of accesses: check that DR belongs to a group of
2433    accesses of legal size, step, etc.  Detect gaps, single element
2434    interleaving, and other special cases. Set grouped access info.
2435    Collect groups of strided stores for further use in SLP analysis.
2436    Worker for vect_analyze_group_access.  */
2437
2438 static bool
2439 vect_analyze_group_access_1 (struct data_reference *dr)
2440 {
2441   tree step = DR_STEP (dr);
2442   tree scalar_type = TREE_TYPE (DR_REF (dr));
2443   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2444   gimple *stmt = DR_STMT (dr);
2445   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2446   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2447   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2448   HOST_WIDE_INT dr_step = -1;
2449   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2450   bool slp_impossible = false;
2451
2452   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2453      size of the interleaving group (including gaps).  */
2454   if (tree_fits_shwi_p (step))
2455     {
2456       dr_step = tree_to_shwi (step);
2457       /* Check that STEP is a multiple of type size.  Otherwise there is
2458          a non-element-sized gap at the end of the group which we
2459          cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2460          ???  As we can handle non-constant step fine here we should
2461          simply remove uses of DR_GROUP_GAP between the last and first
2462          element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
2463          simply not include that gap.  */
2464       if ((dr_step % type_size) != 0)
2465         {
2466           if (dump_enabled_p ())
2467             {
2468               dump_printf_loc (MSG_NOTE, vect_location,
2469                                "Step ");
2470               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2471               dump_printf (MSG_NOTE,
2472                            " is not a multiple of the element size for ");
2473               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2474               dump_printf (MSG_NOTE, "\n");
2475             }
2476           return false;
2477         }
2478       groupsize = absu_hwi (dr_step) / type_size;
2479     }
2480   else
2481     groupsize = 0;
2482
2483   /* Not consecutive access is possible only if it is a part of interleaving.  */
2484   if (!DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2485     {
2486       /* Check if it this DR is a part of interleaving, and is a single
2487          element of the group that is accessed in the loop.  */
2488
2489       /* Gaps are supported only for loads. STEP must be a multiple of the type
2490          size.  */
2491       if (DR_IS_READ (dr)
2492           && (dr_step % type_size) == 0
2493           && groupsize > 0)
2494         {
2495           DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2496           DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2497           DR_GROUP_GAP (stmt_info) = groupsize - 1;
2498           if (dump_enabled_p ())
2499             {
2500               dump_printf_loc (MSG_NOTE, vect_location,
2501                                "Detected single element interleaving ");
2502               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2503               dump_printf (MSG_NOTE, " step ");
2504               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2505               dump_printf (MSG_NOTE, "\n");
2506             }
2507
2508           return true;
2509         }
2510
2511       if (dump_enabled_p ())
2512         {
2513           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2514                            "not consecutive access ");
2515           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2516         }
2517
2518       if (bb_vinfo)
2519         {
2520           /* Mark the statement as unvectorizable.  */
2521           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2522           return true;
2523         }
2524
2525       dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2526       STMT_VINFO_STRIDED_P (stmt_info) = true;
2527       return true;
2528     }
2529
2530   if (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2531     {
2532       /* First stmt in the interleaving chain. Check the chain.  */
2533       gimple *next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2534       struct data_reference *data_ref = dr;
2535       unsigned int count = 1;
2536       tree prev_init = DR_INIT (data_ref);
2537       gimple *prev = stmt;
2538       HOST_WIDE_INT diff, gaps = 0;
2539
2540       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2541       while (next)
2542         {
2543           /* Skip same data-refs.  In case that two or more stmts share
2544              data-ref (supported only for loads), we vectorize only the first
2545              stmt, and the rest get their vectorized loads from the first
2546              one.  */
2547           if (!tree_int_cst_compare (DR_INIT (data_ref),
2548                                      DR_INIT (STMT_VINFO_DATA_REF (
2549                                                    vinfo_for_stmt (next)))))
2550             {
2551               if (DR_IS_WRITE (data_ref))
2552                 {
2553                   if (dump_enabled_p ())
2554                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2555                                      "Two store stmts share the same dr.\n");
2556                   return false;
2557                 }
2558
2559               if (dump_enabled_p ())
2560                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2561                                  "Two or more load stmts share the same dr.\n");
2562
2563               /* For load use the same data-ref load.  */
2564               DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2565
2566               prev = next;
2567               next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2568               continue;
2569             }
2570
2571           prev = next;
2572           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2573
2574           /* All group members have the same STEP by construction.  */
2575           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2576
2577           /* Check that the distance between two accesses is equal to the type
2578              size. Otherwise, we have gaps.  */
2579           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2580                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2581           if (diff != 1)
2582             {
2583               /* FORNOW: SLP of accesses with gaps is not supported.  */
2584               slp_impossible = true;
2585               if (DR_IS_WRITE (data_ref))
2586                 {
2587                   if (dump_enabled_p ())
2588                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2589                                      "interleaved store with gaps\n");
2590                   return false;
2591                 }
2592
2593               gaps += diff - 1;
2594             }
2595
2596           last_accessed_element += diff;
2597
2598           /* Store the gap from the previous member of the group. If there is no
2599              gap in the access, DR_GROUP_GAP is always 1.  */
2600           DR_GROUP_GAP (vinfo_for_stmt (next)) = diff;
2601
2602           prev_init = DR_INIT (data_ref);
2603           next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2604           /* Count the number of data-refs in the chain.  */
2605           count++;
2606         }
2607
2608       if (groupsize == 0)
2609         groupsize = count + gaps;
2610
2611       /* This could be UINT_MAX but as we are generating code in a very
2612          inefficient way we have to cap earlier.  See PR78699 for example.  */
2613       if (groupsize > 4096)
2614         {
2615           if (dump_enabled_p ())
2616             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2617                              "group is too large\n");
2618           return false;
2619         }
2620
2621       /* Check that the size of the interleaving is equal to count for stores,
2622          i.e., that there are no gaps.  */
2623       if (groupsize != count
2624           && !DR_IS_READ (dr))
2625         {
2626           if (dump_enabled_p ())
2627             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2628                              "interleaved store with gaps\n");
2629           return false;
2630         }
2631
2632       /* If there is a gap after the last load in the group it is the
2633          difference between the groupsize and the last accessed
2634          element.
2635          When there is no gap, this difference should be 0.  */
2636       DR_GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2637
2638       DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2639       if (dump_enabled_p ())
2640         {
2641           dump_printf_loc (MSG_NOTE, vect_location,
2642                            "Detected interleaving ");
2643           if (DR_IS_READ (dr))
2644             dump_printf (MSG_NOTE, "load ");
2645           else
2646             dump_printf (MSG_NOTE, "store ");
2647           dump_printf (MSG_NOTE, "of size %u starting with ",
2648                        (unsigned)groupsize);
2649           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2650           if (DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2651             dump_printf_loc (MSG_NOTE, vect_location,
2652                              "There is a gap of %u elements after the group\n",
2653                              DR_GROUP_GAP (vinfo_for_stmt (stmt)));
2654         }
2655
2656       /* SLP: create an SLP data structure for every interleaving group of
2657          stores for further analysis in vect_analyse_slp.  */
2658       if (DR_IS_WRITE (dr) && !slp_impossible)
2659         {
2660           if (loop_vinfo)
2661             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2662           if (bb_vinfo)
2663             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2664         }
2665     }
2666
2667   return true;
2668 }
2669
2670 /* Analyze groups of accesses: check that DR belongs to a group of
2671    accesses of legal size, step, etc.  Detect gaps, single element
2672    interleaving, and other special cases. Set grouped access info.
2673    Collect groups of strided stores for further use in SLP analysis.  */
2674
2675 static bool
2676 vect_analyze_group_access (struct data_reference *dr)
2677 {
2678   if (!vect_analyze_group_access_1 (dr))
2679     {
2680       /* Dissolve the group if present.  */
2681       gimple *next;
2682       gimple *stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2683       while (stmt)
2684         {
2685           stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2686           next = DR_GROUP_NEXT_ELEMENT (vinfo);
2687           DR_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2688           DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2689           stmt = next;
2690         }
2691       return false;
2692     }
2693   return true;
2694 }
2695
2696 /* Analyze the access pattern of the data-reference DR.
2697    In case of non-consecutive accesses call vect_analyze_group_access() to
2698    analyze groups of accesses.  */
2699
2700 static bool
2701 vect_analyze_data_ref_access (struct data_reference *dr)
2702 {
2703   tree step = DR_STEP (dr);
2704   tree scalar_type = TREE_TYPE (DR_REF (dr));
2705   gimple *stmt = DR_STMT (dr);
2706   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2707   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2708   struct loop *loop = NULL;
2709
2710   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2711     return true;
2712
2713   if (loop_vinfo)
2714     loop = LOOP_VINFO_LOOP (loop_vinfo);
2715
2716   if (loop_vinfo && !step)
2717     {
2718       if (dump_enabled_p ())
2719         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2720                          "bad data-ref access in loop\n");
2721       return false;
2722     }
2723
2724   /* Allow loads with zero step in inner-loop vectorization.  */
2725   if (loop_vinfo && integer_zerop (step))
2726     {
2727       DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2728       if (!nested_in_vect_loop_p (loop, stmt))
2729         return DR_IS_READ (dr);
2730       /* Allow references with zero step for outer loops marked
2731          with pragma omp simd only - it guarantees absence of
2732          loop-carried dependencies between inner loop iterations.  */
2733       if (loop->safelen < 2)
2734         {
2735           if (dump_enabled_p ())
2736             dump_printf_loc (MSG_NOTE, vect_location,
2737                              "zero step in inner loop of nest\n");
2738           return false;
2739         }
2740     }
2741
2742   if (loop && nested_in_vect_loop_p (loop, stmt))
2743     {
2744       /* Interleaved accesses are not yet supported within outer-loop
2745         vectorization for references in the inner-loop.  */
2746       DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2747
2748       /* For the rest of the analysis we use the outer-loop step.  */
2749       step = STMT_VINFO_DR_STEP (stmt_info);
2750       if (integer_zerop (step))
2751         {
2752           if (dump_enabled_p ())
2753             dump_printf_loc (MSG_NOTE, vect_location,
2754                              "zero step in outer loop.\n");
2755           return DR_IS_READ (dr);
2756         }
2757     }
2758
2759   /* Consecutive?  */
2760   if (TREE_CODE (step) == INTEGER_CST)
2761     {
2762       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2763       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2764           || (dr_step < 0
2765               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2766         {
2767           /* Mark that it is not interleaving.  */
2768           DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2769           return true;
2770         }
2771     }
2772
2773   if (loop && nested_in_vect_loop_p (loop, stmt))
2774     {
2775       if (dump_enabled_p ())
2776         dump_printf_loc (MSG_NOTE, vect_location,
2777                          "grouped access in outer loop.\n");
2778       return false;
2779     }
2780
2781
2782   /* Assume this is a DR handled by non-constant strided load case.  */
2783   if (TREE_CODE (step) != INTEGER_CST)
2784     return (STMT_VINFO_STRIDED_P (stmt_info)
2785             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2786                 || vect_analyze_group_access (dr)));
2787
2788   /* Not consecutive access - check if it's a part of interleaving group.  */
2789   return vect_analyze_group_access (dr);
2790 }
2791
2792 /* Compare two data-references DRA and DRB to group them into chunks
2793    suitable for grouping.  */
2794
2795 static int
2796 dr_group_sort_cmp (const void *dra_, const void *drb_)
2797 {
2798   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2799   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2800   int cmp;
2801
2802   /* Stabilize sort.  */
2803   if (dra == drb)
2804     return 0;
2805
2806   /* DRs in different loops never belong to the same group.  */
2807   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2808   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2809   if (loopa != loopb)
2810     return loopa->num < loopb->num ? -1 : 1;
2811
2812   /* Ordering of DRs according to base.  */
2813   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2814                                DR_BASE_ADDRESS (drb));
2815   if (cmp != 0)
2816     return cmp;
2817
2818   /* And according to DR_OFFSET.  */
2819   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2820   if (cmp != 0)
2821     return cmp;
2822
2823   /* Put reads before writes.  */
2824   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2825     return DR_IS_READ (dra) ? -1 : 1;
2826
2827   /* Then sort after access size.  */
2828   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2829                                TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2830   if (cmp != 0)
2831     return cmp;
2832
2833   /* And after step.  */
2834   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2835   if (cmp != 0)
2836     return cmp;
2837
2838   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2839   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2840   if (cmp == 0)
2841     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2842   return cmp;
2843 }
2844
2845 /* If OP is the result of a conversion, return the unconverted value,
2846    otherwise return null.  */
2847
2848 static tree
2849 strip_conversion (tree op)
2850 {
2851   if (TREE_CODE (op) != SSA_NAME)
2852     return NULL_TREE;
2853   gimple *stmt = SSA_NAME_DEF_STMT (op);
2854   if (!is_gimple_assign (stmt)
2855       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2856     return NULL_TREE;
2857   return gimple_assign_rhs1 (stmt);
2858 }
2859
2860 /* Return true if vectorizable_* routines can handle statements STMT1
2861    and STMT2 being in a single group.  */
2862
2863 static bool
2864 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2865 {
2866   if (gimple_assign_single_p (stmt1))
2867     return gimple_assign_single_p (stmt2);
2868
2869   if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2870     {
2871       /* Check for two masked loads or two masked stores.  */
2872       if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2873         return false;
2874       internal_fn ifn = gimple_call_internal_fn (stmt1);
2875       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2876         return false;
2877       if (ifn != gimple_call_internal_fn (stmt2))
2878         return false;
2879
2880       /* Check that the masks are the same.  Cope with casts of masks,
2881          like those created by build_mask_conversion.  */
2882       tree mask1 = gimple_call_arg (stmt1, 2);
2883       tree mask2 = gimple_call_arg (stmt2, 2);
2884       if (!operand_equal_p (mask1, mask2, 0))
2885         {
2886           mask1 = strip_conversion (mask1);
2887           if (!mask1)
2888             return false;
2889           mask2 = strip_conversion (mask2);
2890           if (!mask2)
2891             return false;
2892           if (!operand_equal_p (mask1, mask2, 0))
2893             return false;
2894         }
2895       return true;
2896     }
2897
2898   return false;
2899 }
2900
2901 /* Function vect_analyze_data_ref_accesses.
2902
2903    Analyze the access pattern of all the data references in the loop.
2904
2905    FORNOW: the only access pattern that is considered vectorizable is a
2906            simple step 1 (consecutive) access.
2907
2908    FORNOW: handle only arrays and pointer accesses.  */
2909
2910 bool
2911 vect_analyze_data_ref_accesses (vec_info *vinfo)
2912 {
2913   unsigned int i;
2914   vec<data_reference_p> datarefs = vinfo->datarefs;
2915   struct data_reference *dr;
2916
2917   if (dump_enabled_p ())
2918     dump_printf_loc (MSG_NOTE, vect_location,
2919                      "=== vect_analyze_data_ref_accesses ===\n");
2920
2921   if (datarefs.is_empty ())
2922     return true;
2923
2924   /* Sort the array of datarefs to make building the interleaving chains
2925      linear.  Don't modify the original vector's order, it is needed for
2926      determining what dependencies are reversed.  */
2927   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2928   datarefs_copy.qsort (dr_group_sort_cmp);
2929
2930   /* Build the interleaving chains.  */
2931   for (i = 0; i < datarefs_copy.length () - 1;)
2932     {
2933       data_reference_p dra = datarefs_copy[i];
2934       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2935       stmt_vec_info lastinfo = NULL;
2936       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2937           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2938         {
2939           ++i;
2940           continue;
2941         }
2942       for (i = i + 1; i < datarefs_copy.length (); ++i)
2943         {
2944           data_reference_p drb = datarefs_copy[i];
2945           stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2946           if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2947               || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2948             break;
2949
2950           /* ???  Imperfect sorting (non-compatible types, non-modulo
2951              accesses, same accesses) can lead to a group to be artificially
2952              split here as we don't just skip over those.  If it really
2953              matters we can push those to a worklist and re-iterate
2954              over them.  The we can just skip ahead to the next DR here.  */
2955
2956           /* DRs in a different loop should not be put into the same
2957              interleaving group.  */
2958           if (gimple_bb (DR_STMT (dra))->loop_father
2959               != gimple_bb (DR_STMT (drb))->loop_father)
2960             break;
2961
2962           /* Check that the data-refs have same first location (except init)
2963              and they are both either store or load (not load and store,
2964              not masked loads or stores).  */
2965           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2966               || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2967                                         DR_BASE_ADDRESS (drb)) != 0
2968               || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2969               || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
2970             break;
2971
2972           /* Check that the data-refs have the same constant size.  */
2973           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2974           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2975           if (!tree_fits_uhwi_p (sza)
2976               || !tree_fits_uhwi_p (szb)
2977               || !tree_int_cst_equal (sza, szb))
2978             break;
2979
2980           /* Check that the data-refs have the same step.  */
2981           if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2982             break;
2983
2984           /* Check the types are compatible.
2985              ???  We don't distinguish this during sorting.  */
2986           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2987                                    TREE_TYPE (DR_REF (drb))))
2988             break;
2989
2990           /* Check that the DR_INITs are compile-time constants.  */
2991           if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2992               || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2993             break;
2994
2995           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2996           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2997           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2998           HOST_WIDE_INT init_prev
2999             = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3000           gcc_assert (init_a <= init_b
3001                       && init_a <= init_prev
3002                       && init_prev <= init_b);
3003
3004           /* Do not place the same access in the interleaving chain twice.  */
3005           if (init_b == init_prev)
3006             {
3007               gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3008                           < gimple_uid (DR_STMT (drb)));
3009               /* ???  For now we simply "drop" the later reference which is
3010                  otherwise the same rather than finishing off this group.
3011                  In the end we'd want to re-process duplicates forming
3012                  multiple groups from the refs, likely by just collecting
3013                  all candidates (including duplicates and split points
3014                  below) in a vector and then process them together.  */
3015               continue;
3016             }
3017
3018           /* If init_b == init_a + the size of the type * k, we have an
3019              interleaving, and DRA is accessed before DRB.  */
3020           HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3021           if (type_size_a == 0
3022               || (init_b - init_a) % type_size_a != 0)
3023             break;
3024
3025           /* If we have a store, the accesses are adjacent.  This splits
3026              groups into chunks we support (we don't support vectorization
3027              of stores with gaps).  */
3028           if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3029             break;
3030
3031           /* If the step (if not zero or non-constant) is greater than the
3032              difference between data-refs' inits this splits groups into
3033              suitable sizes.  */
3034           if (tree_fits_shwi_p (DR_STEP (dra)))
3035             {
3036               HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3037               if (step != 0 && step <= (init_b - init_a))
3038                 break;
3039             }
3040
3041           if (dump_enabled_p ())
3042             {
3043               dump_printf_loc (MSG_NOTE, vect_location,
3044                                "Detected interleaving ");
3045               if (DR_IS_READ (dra))
3046                 dump_printf (MSG_NOTE, "load ");
3047               else
3048                 dump_printf (MSG_NOTE, "store ");
3049               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3050               dump_printf (MSG_NOTE,  " and ");
3051               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3052               dump_printf (MSG_NOTE, "\n");
3053             }
3054
3055           /* Link the found element into the group list.  */
3056           if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3057             {
3058               DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
3059               lastinfo = stmtinfo_a;
3060             }
3061           DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
3062           DR_GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
3063           lastinfo = stmtinfo_b;
3064         }
3065     }
3066
3067   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3068     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
3069         && !vect_analyze_data_ref_access (dr))
3070       {
3071         if (dump_enabled_p ())
3072           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3073                            "not vectorized: complicated access pattern.\n");
3074
3075         if (is_a <bb_vec_info> (vinfo))
3076           {
3077             /* Mark the statement as not vectorizable.  */
3078             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3079             continue;
3080           }
3081         else
3082           {
3083             datarefs_copy.release ();
3084             return false;
3085           }
3086       }
3087
3088   datarefs_copy.release ();
3089   return true;
3090 }
3091
3092 /* Function vect_vfa_segment_size.
3093
3094    Input:
3095      DR: The data reference.
3096      LENGTH_FACTOR: segment length to consider.
3097
3098    Return a value suitable for the dr_with_seg_len::seg_len field.
3099    This is the "distance travelled" by the pointer from the first
3100    iteration in the segment to the last.  Note that it does not include
3101    the size of the access; in effect it only describes the first byte.  */
3102
3103 static tree
3104 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3105 {
3106   length_factor = size_binop (MINUS_EXPR,
3107                               fold_convert (sizetype, length_factor),
3108                               size_one_node);
3109   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3110                      length_factor);
3111 }
3112
3113 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3114    gives the worst-case number of bytes covered by the segment.  */
3115
3116 static unsigned HOST_WIDE_INT
3117 vect_vfa_access_size (data_reference *dr)
3118 {
3119   stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr));
3120   tree ref_type = TREE_TYPE (DR_REF (dr));
3121   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3122   unsigned HOST_WIDE_INT access_size = ref_size;
3123   if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3124     {
3125       gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr));
3126       access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3127     }
3128   if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3129       && (vect_supportable_dr_alignment (dr, false)
3130           == dr_explicit_realign_optimized))
3131     {
3132       /* We might access a full vector's worth.  */
3133       tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3134       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3135     }
3136   return access_size;
3137 }
3138
3139 /* Get the minimum alignment for all the scalar accesses that DR describes.  */
3140
3141 static unsigned int
3142 vect_vfa_align (const data_reference *dr)
3143 {
3144   return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3145 }
3146
3147 /* Function vect_no_alias_p.
3148
3149    Given data references A and B with equal base and offset, see whether
3150    the alias relation can be decided at compilation time.  Return 1 if
3151    it can and the references alias, 0 if it can and the references do
3152    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3153    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3154    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
3155
3156 static int
3157 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3158                          tree segment_length_a, tree segment_length_b,
3159                          unsigned HOST_WIDE_INT access_size_a,
3160                          unsigned HOST_WIDE_INT access_size_b)
3161 {
3162   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3163   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3164   poly_uint64 const_length_a;
3165   poly_uint64 const_length_b;
3166
3167   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3168      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3169      [a, a+12) */
3170   if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3171     {
3172       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3173       offset_a = (offset_a + access_size_a) - const_length_a;
3174     }
3175   else
3176     const_length_a = tree_to_poly_uint64 (segment_length_a);
3177   if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3178     {
3179       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3180       offset_b = (offset_b + access_size_b) - const_length_b;
3181     }
3182   else
3183     const_length_b = tree_to_poly_uint64 (segment_length_b);
3184
3185   const_length_a += access_size_a;
3186   const_length_b += access_size_b;
3187
3188   if (ranges_known_overlap_p (offset_a, const_length_a,
3189                               offset_b, const_length_b))
3190     return 1;
3191
3192   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3193                                offset_b, const_length_b))
3194     return 0;
3195
3196   return -1;
3197 }
3198
3199 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3200    in DDR is >= VF.  */
3201
3202 static bool
3203 dependence_distance_ge_vf (data_dependence_relation *ddr,
3204                            unsigned int loop_depth, poly_uint64 vf)
3205 {
3206   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3207       || DDR_NUM_DIST_VECTS (ddr) == 0)
3208     return false;
3209
3210   /* If the dependence is exact, we should have limited the VF instead.  */
3211   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3212
3213   unsigned int i;
3214   lambda_vector dist_v;
3215   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3216     {
3217       HOST_WIDE_INT dist = dist_v[loop_depth];
3218       if (dist != 0
3219           && !(dist > 0 && DDR_REVERSED_P (ddr))
3220           && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3221         return false;
3222     }
3223
3224   if (dump_enabled_p ())
3225     {
3226       dump_printf_loc (MSG_NOTE, vect_location,
3227                        "dependence distance between ");
3228       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3229       dump_printf (MSG_NOTE,  " and ");
3230       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3231       dump_printf (MSG_NOTE,  " is >= VF\n");
3232     }
3233
3234   return true;
3235 }
3236
3237 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3238
3239 static void
3240 dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound)
3241 {
3242   dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3243   dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3244   dump_printf (dump_kind, ") >= ");
3245   dump_dec (dump_kind, lower_bound.min_value);
3246 }
3247
3248 /* Record that the vectorized loop requires the vec_lower_bound described
3249    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3250
3251 static void
3252 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3253                         poly_uint64 min_value)
3254 {
3255   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3256   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3257     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3258       {
3259         unsigned_p &= lower_bounds[i].unsigned_p;
3260         min_value = upper_bound (lower_bounds[i].min_value, min_value);
3261         if (lower_bounds[i].unsigned_p != unsigned_p
3262             || maybe_lt (lower_bounds[i].min_value, min_value))
3263           {
3264             lower_bounds[i].unsigned_p = unsigned_p;
3265             lower_bounds[i].min_value = min_value;
3266             if (dump_enabled_p ())
3267               {
3268                 dump_printf_loc (MSG_NOTE, vect_location,
3269                                  "updating run-time check to ");
3270                 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3271                 dump_printf (MSG_NOTE, "\n");
3272               }
3273           }
3274         return;
3275       }
3276
3277   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3278   if (dump_enabled_p ())
3279     {
3280       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3281       dump_lower_bound (MSG_NOTE, lower_bound);
3282       dump_printf (MSG_NOTE, "\n");
3283     }
3284   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3285 }
3286
3287 /* Return true if it's unlikely that the step of the vectorized form of DR
3288    will span fewer than GAP bytes.  */
3289
3290 static bool
3291 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3292 {
3293   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
3294   HOST_WIDE_INT count
3295     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3296   if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3297     count *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
3298   return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3299 }
3300
3301 /* Return true if we know that there is no alias between DR_A and DR_B
3302    when abs (DR_STEP (DR_A)) >= N for some N.  When returning true, set
3303    *LOWER_BOUND_OUT to this N.  */
3304
3305 static bool
3306 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3307                                 poly_uint64 *lower_bound_out)
3308 {
3309   /* Check that there is a constant gap of known sign between DR_A
3310      and DR_B.  */
3311   poly_int64 init_a, init_b;
3312   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3313       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3314       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3315       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3316       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3317       || !ordered_p (init_a, init_b))
3318     return false;
3319
3320   /* Sort DR_A and DR_B by the address they access.  */
3321   if (maybe_lt (init_b, init_a))
3322     {
3323       std::swap (init_a, init_b);
3324       std::swap (dr_a, dr_b);
3325     }
3326
3327   /* If the two accesses could be dependent within a scalar iteration,
3328      make sure that we'd retain their order.  */
3329   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3330       && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b)))
3331     return false;
3332
3333   /* There is no alias if abs (DR_STEP) is greater than or equal to
3334      the bytes spanned by the combination of the two accesses.  */
3335   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3336   return true;
3337 }
3338
3339 /* Function vect_prune_runtime_alias_test_list.
3340
3341    Prune a list of ddrs to be tested at run-time by versioning for alias.
3342    Merge several alias checks into one if possible.
3343    Return FALSE if resulting list of ddrs is longer then allowed by
3344    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3345
3346 bool
3347 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3348 {
3349   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3350   hash_set <tree_pair_hash> compared_objects;
3351
3352   vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3353   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3354     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3355   vec<vec_object_pair> &check_unequal_addrs
3356     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3357   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3358   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3359
3360   ddr_p ddr;
3361   unsigned int i;
3362   tree length_factor;
3363
3364   if (dump_enabled_p ())
3365     dump_printf_loc (MSG_NOTE, vect_location,
3366                      "=== vect_prune_runtime_alias_test_list ===\n");
3367
3368   /* Step values are irrelevant for aliasing if the number of vector
3369      iterations is equal to the number of scalar iterations (which can
3370      happen for fully-SLP loops).  */
3371   bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3372
3373   if (!ignore_step_p)
3374     {
3375       /* Convert the checks for nonzero steps into bound tests.  */
3376       tree value;
3377       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3378         vect_check_lower_bound (loop_vinfo, value, true, 1);
3379     }
3380
3381   if (may_alias_ddrs.is_empty ())
3382     return true;
3383
3384   comp_alias_ddrs.create (may_alias_ddrs.length ());
3385
3386   unsigned int loop_depth
3387     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3388                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
3389
3390   /* First, we collect all data ref pairs for aliasing checks.  */
3391   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3392     {
3393       int comp_res;
3394       poly_uint64 lower_bound;
3395       struct data_reference *dr_a, *dr_b;
3396       gimple *dr_group_first_a, *dr_group_first_b;
3397       tree segment_length_a, segment_length_b;
3398       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3399       unsigned int align_a, align_b;
3400       gimple *stmt_a, *stmt_b;
3401
3402       /* Ignore the alias if the VF we chose ended up being no greater
3403          than the dependence distance.  */
3404       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3405         continue;
3406
3407       if (DDR_OBJECT_A (ddr))
3408         {
3409           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3410           if (!compared_objects.add (new_pair))
3411             {
3412               if (dump_enabled_p ())
3413                 {
3414                   dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3415                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3416                   dump_printf (MSG_NOTE, " and ");
3417                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3418                   dump_printf (MSG_NOTE, " have different addresses\n");
3419                 }
3420               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3421             }
3422           continue;
3423         }
3424
3425       dr_a = DDR_A (ddr);
3426       stmt_a = DR_STMT (DDR_A (ddr));
3427
3428       dr_b = DDR_B (ddr);
3429       stmt_b = DR_STMT (DDR_B (ddr));
3430
3431       /* Skip the pair if inter-iteration dependencies are irrelevant
3432          and intra-iteration dependencies are guaranteed to be honored.  */
3433       if (ignore_step_p
3434           && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3435               || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3436         {
3437           if (dump_enabled_p ())
3438             {
3439               dump_printf_loc (MSG_NOTE, vect_location,
3440                                "no need for alias check between ");
3441               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3442               dump_printf (MSG_NOTE, " and ");
3443               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3444               dump_printf (MSG_NOTE, " when VF is 1\n");
3445             }
3446           continue;
3447         }
3448
3449       /* See whether we can handle the alias using a bounds check on
3450          the step, and whether that's likely to be the best approach.
3451          (It might not be, for example, if the minimum step is much larger
3452          than the number of bytes handled by one vector iteration.)  */
3453       if (!ignore_step_p
3454           && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3455           && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3456           && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3457               || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3458         {
3459           bool unsigned_p = dr_known_forward_stride_p (dr_a);
3460           if (dump_enabled_p ())
3461             {
3462               dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3463               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3464               dump_printf (MSG_NOTE, " and ");
3465               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3466               dump_printf (MSG_NOTE, " when the step ");
3467               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3468               dump_printf (MSG_NOTE, " is outside ");
3469               if (unsigned_p)
3470                 dump_printf (MSG_NOTE, "[0");
3471               else
3472                 {
3473                   dump_printf (MSG_NOTE, "(");
3474                   dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3475                 }
3476               dump_printf (MSG_NOTE, ", ");
3477               dump_dec (MSG_NOTE, lower_bound);
3478               dump_printf (MSG_NOTE, ")\n");
3479             }
3480           vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3481                                   lower_bound);
3482           continue;
3483         }
3484
3485       dr_group_first_a = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3486       if (dr_group_first_a)
3487         {
3488           stmt_a = dr_group_first_a;
3489           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3490         }
3491
3492       dr_group_first_b = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3493       if (dr_group_first_b)
3494         {
3495           stmt_b = dr_group_first_b;
3496           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3497         }
3498
3499       if (ignore_step_p)
3500         {
3501           segment_length_a = size_zero_node;
3502           segment_length_b = size_zero_node;
3503         }
3504       else
3505         {
3506           if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3507             length_factor = scalar_loop_iters;
3508           else
3509             length_factor = size_int (vect_factor);
3510           segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3511           segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3512         }
3513       access_size_a = vect_vfa_access_size (dr_a);
3514       access_size_b = vect_vfa_access_size (dr_b);
3515       align_a = vect_vfa_align (dr_a);
3516       align_b = vect_vfa_align (dr_b);
3517
3518       comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3519                                         DR_BASE_ADDRESS (dr_b));
3520       if (comp_res == 0)
3521         comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3522                                           DR_OFFSET (dr_b));
3523
3524       /* See whether the alias is known at compilation time.  */
3525       if (comp_res == 0
3526           && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3527           && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3528           && poly_int_tree_p (segment_length_a)
3529           && poly_int_tree_p (segment_length_b))
3530         {
3531           int res = vect_compile_time_alias (dr_a, dr_b,
3532                                              segment_length_a,
3533                                              segment_length_b,
3534                                              access_size_a,
3535                                              access_size_b);
3536           if (res >= 0 && dump_enabled_p ())
3537             {
3538               dump_printf_loc (MSG_NOTE, vect_location,
3539                                "can tell at compile time that ");
3540               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3541               dump_printf (MSG_NOTE, " and ");
3542               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3543               if (res == 0)
3544                 dump_printf (MSG_NOTE, " do not alias\n");
3545               else
3546                 dump_printf (MSG_NOTE, " alias\n");
3547             }
3548
3549           if (res == 0)
3550             continue;
3551
3552           if (res == 1)
3553             {
3554               if (dump_enabled_p ())
3555                 dump_printf_loc (MSG_NOTE, vect_location,
3556                                  "not vectorized: compilation time alias.\n");
3557               return false;
3558             }
3559         }
3560
3561       dr_with_seg_len_pair_t dr_with_seg_len_pair
3562         (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3563          dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3564
3565       /* Canonicalize pairs by sorting the two DR members.  */
3566       if (comp_res > 0)
3567         std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3568
3569       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3570     }
3571
3572   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3573
3574   unsigned int count = (comp_alias_ddrs.length ()
3575                         + check_unequal_addrs.length ());
3576
3577   dump_printf_loc (MSG_NOTE, vect_location,
3578                    "improved number of alias checks from %d to %d\n",
3579                    may_alias_ddrs.length (), count);
3580   if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3581     {
3582       if (dump_enabled_p ())
3583         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3584                          "number of versioning for alias "
3585                          "run-time tests exceeds %d "
3586                          "(--param vect-max-version-for-alias-checks)\n",
3587                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3588       return false;
3589     }
3590
3591   return true;
3592 }
3593
3594 /* Check whether we can use an internal function for a gather load
3595    or scatter store.  READ_P is true for loads and false for stores.
3596    MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
3597    the type of the memory elements being loaded or stored.  OFFSET_BITS
3598    is the number of bits in each scalar offset and OFFSET_SIGN is the
3599    sign of the offset.  SCALE is the amount by which the offset should
3600    be multiplied *after* it has been converted to address width.
3601
3602    Return true if the function is supported, storing the function
3603    id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT.  */
3604
3605 bool
3606 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3607                           tree memory_type, unsigned int offset_bits,
3608                           signop offset_sign, int scale,
3609                           internal_fn *ifn_out, tree *element_type_out)
3610 {
3611   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3612   unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3613   if (offset_bits > element_bits)
3614     /* Internal functions require the offset to be the same width as
3615        the vector elements.  We can extend narrower offsets, but it isn't
3616        safe to truncate wider offsets.  */
3617     return false;
3618
3619   if (element_bits != memory_bits)
3620     /* For now the vector elements must be the same width as the
3621        memory elements.  */
3622     return false;
3623
3624   /* Work out which function we need.  */
3625   internal_fn ifn;
3626   if (read_p)
3627     ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3628   else
3629     ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3630
3631   /* Test whether the target supports this combination.  */
3632   if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3633                                                offset_sign, scale))
3634     return false;
3635
3636   *ifn_out = ifn;
3637   *element_type_out = TREE_TYPE (vectype);
3638   return true;
3639 }
3640
3641 /* CALL is a call to an internal gather load or scatter store function.
3642    Describe the operation in INFO.  */
3643
3644 static void
3645 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3646 {
3647   stmt_vec_info stmt_info = vinfo_for_stmt (call);
3648   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3649   data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3650
3651   info->ifn = gimple_call_internal_fn (call);
3652   info->decl = NULL_TREE;
3653   info->base = gimple_call_arg (call, 0);
3654   info->offset = gimple_call_arg (call, 1);
3655   info->offset_dt = vect_unknown_def_type;
3656   info->offset_vectype = NULL_TREE;
3657   info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3658   info->element_type = TREE_TYPE (vectype);
3659   info->memory_type = TREE_TYPE (DR_REF (dr));
3660 }
3661
3662 /* Return true if a non-affine read or write in STMT is suitable for a
3663    gather load or scatter store.  Describe the operation in *INFO if so.  */
3664
3665 bool
3666 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3667                            gather_scatter_info *info)
3668 {
3669   HOST_WIDE_INT scale = 1;
3670   poly_int64 pbitpos, pbitsize;
3671   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3672   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3673   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3674   tree offtype = NULL_TREE;
3675   tree decl = NULL_TREE, base, off;
3676   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3677   tree memory_type = TREE_TYPE (DR_REF (dr));
3678   machine_mode pmode;
3679   int punsignedp, reversep, pvolatilep = 0;
3680   internal_fn ifn;
3681   tree element_type;
3682   bool masked_p = false;
3683
3684   /* See whether this is already a call to a gather/scatter internal function.
3685      If not, see whether it's a masked load or store.  */
3686   gcall *call = dyn_cast <gcall *> (stmt);
3687   if (call && gimple_call_internal_p (call))
3688     {
3689       ifn = gimple_call_internal_fn (stmt);
3690       if (internal_gather_scatter_fn_p (ifn))
3691         {
3692           vect_describe_gather_scatter_call (call, info);
3693           return true;
3694         }
3695       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3696     }
3697
3698   /* True if we should aim to use internal functions rather than
3699      built-in functions.  */
3700   bool use_ifn_p = (DR_IS_READ (dr)
3701                     ? supports_vec_gather_load_p ()
3702                     : supports_vec_scatter_store_p ());
3703
3704   base = DR_REF (dr);
3705   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3706      see if we can use the def stmt of the address.  */
3707   if (masked_p
3708       && TREE_CODE (base) == MEM_REF
3709       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3710       && integer_zerop (TREE_OPERAND (base, 1))
3711       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3712     {
3713       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3714       if (is_gimple_assign (def_stmt)
3715           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3716         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3717     }
3718
3719   /* The gather and scatter builtins need address of the form
3720      loop_invariant + vector * {1, 2, 4, 8}
3721      or
3722      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3723      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3724      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3725      multiplications and additions in it.  To get a vector, we need
3726      a single SSA_NAME that will be defined in the loop and will
3727      contain everything that is not loop invariant and that can be
3728      vectorized.  The following code attempts to find such a preexistng
3729      SSA_NAME OFF and put the loop invariants into a tree BASE
3730      that can be gimplified before the loop.  */
3731   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3732                               &punsignedp, &reversep, &pvolatilep);
3733   gcc_assert (base && !reversep);
3734   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3735
3736   if (TREE_CODE (base) == MEM_REF)
3737     {
3738       if (!integer_zerop (TREE_OPERAND (base, 1)))
3739         {
3740           if (off == NULL_TREE)
3741             off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3742           else
3743             off = size_binop (PLUS_EXPR, off,
3744                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3745         }
3746       base = TREE_OPERAND (base, 0);
3747     }
3748   else
3749     base = build_fold_addr_expr (base);
3750
3751   if (off == NULL_TREE)
3752     off = size_zero_node;
3753
3754   /* If base is not loop invariant, either off is 0, then we start with just
3755      the constant offset in the loop invariant BASE and continue with base
3756      as OFF, otherwise give up.
3757      We could handle that case by gimplifying the addition of base + off
3758      into some SSA_NAME and use that as off, but for now punt.  */
3759   if (!expr_invariant_in_loop_p (loop, base))
3760     {
3761       if (!integer_zerop (off))
3762         return false;
3763       off = base;
3764       base = size_int (pbytepos);
3765     }
3766   /* Otherwise put base + constant offset into the loop invariant BASE
3767      and continue with OFF.  */
3768   else
3769     {
3770       base = fold_convert (sizetype, base);
3771       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3772     }
3773
3774   /* OFF at this point may be either a SSA_NAME or some tree expression
3775      from get_inner_reference.  Try to peel off loop invariants from it
3776      into BASE as long as possible.  */
3777   STRIP_NOPS (off);
3778   while (offtype == NULL_TREE)
3779     {
3780       enum tree_code code;
3781       tree op0, op1, add = NULL_TREE;
3782
3783       if (TREE_CODE (off) == SSA_NAME)
3784         {
3785           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3786
3787           if (expr_invariant_in_loop_p (loop, off))
3788             return false;
3789
3790           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3791             break;
3792
3793           op0 = gimple_assign_rhs1 (def_stmt);
3794           code = gimple_assign_rhs_code (def_stmt);
3795           op1 = gimple_assign_rhs2 (def_stmt);
3796         }
3797       else
3798         {
3799           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3800             return false;
3801           code = TREE_CODE (off);
3802           extract_ops_from_tree (off, &code, &op0, &op1);
3803         }
3804       switch (code)
3805         {
3806         case POINTER_PLUS_EXPR:
3807         case PLUS_EXPR:
3808           if (expr_invariant_in_loop_p (loop, op0))
3809             {
3810               add = op0;
3811               off = op1;
3812             do_add:
3813               add = fold_convert (sizetype, add);
3814               if (scale != 1)
3815                 add = size_binop (MULT_EXPR, add, size_int (scale));
3816               base = size_binop (PLUS_EXPR, base, add);
3817               continue;
3818             }
3819           if (expr_invariant_in_loop_p (loop, op1))
3820             {
3821               add = op1;
3822               off = op0;
3823               goto do_add;
3824             }
3825           break;
3826         case MINUS_EXPR:
3827           if (expr_invariant_in_loop_p (loop, op1))
3828             {
3829               add = fold_convert (sizetype, op1);
3830               add = size_binop (MINUS_EXPR, size_zero_node, add);
3831               off = op0;
3832               goto do_add;
3833             }
3834           break;
3835         case MULT_EXPR:
3836           if (scale == 1 && tree_fits_shwi_p (op1))
3837             {
3838               int new_scale = tree_to_shwi (op1);
3839               /* Only treat this as a scaling operation if the target
3840                  supports it.  */
3841               if (use_ifn_p
3842                   && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3843                                                 vectype, memory_type, 1,
3844                                                 TYPE_SIGN (TREE_TYPE (op0)),
3845                                                 new_scale, &ifn,
3846                                                 &element_type))
3847                 break;
3848               scale = new_scale;
3849               off = op0;
3850               continue;
3851             }
3852           break;
3853         case SSA_NAME:
3854           off = op0;
3855           continue;
3856         CASE_CONVERT:
3857           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3858               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3859             break;
3860           if (TYPE_PRECISION (TREE_TYPE (op0))
3861               == TYPE_PRECISION (TREE_TYPE (off)))
3862             {
3863               off = op0;
3864               continue;
3865             }
3866
3867           /* The internal functions need the offset to be the same width
3868              as the elements of VECTYPE.  Don't include operations that
3869              cast the offset from that width to a different width.  */
3870           if (use_ifn_p
3871               && (int_size_in_bytes (TREE_TYPE (vectype))
3872                   == int_size_in_bytes (TREE_TYPE (off))))
3873             break;
3874
3875           if (TYPE_PRECISION (TREE_TYPE (op0))
3876               < TYPE_PRECISION (TREE_TYPE (off)))
3877             {
3878               off = op0;
3879               offtype = TREE_TYPE (off);
3880               STRIP_NOPS (off);
3881               continue;
3882             }
3883           break;
3884         default:
3885           break;
3886         }
3887       break;
3888     }
3889
3890   /* If at the end OFF still isn't a SSA_NAME or isn't
3891      defined in the loop, punt.  */
3892   if (TREE_CODE (off) != SSA_NAME
3893       || expr_invariant_in_loop_p (loop, off))
3894     return false;
3895
3896   if (offtype == NULL_TREE)
3897     offtype = TREE_TYPE (off);
3898
3899   if (use_ifn_p)
3900     {
3901       if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3902                                      memory_type, TYPE_PRECISION (offtype),
3903                                      TYPE_SIGN (offtype), scale, &ifn,
3904                                      &element_type))
3905         return false;
3906     }
3907   else
3908     {
3909       if (DR_IS_READ (dr))
3910         {
3911           if (targetm.vectorize.builtin_gather)
3912             decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3913         }
3914       else
3915         {
3916           if (targetm.vectorize.builtin_scatter)
3917             decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3918         }
3919
3920       if (!decl)
3921         return false;
3922
3923       ifn = IFN_LAST;
3924       element_type = TREE_TYPE (vectype);
3925     }
3926
3927   info->ifn = ifn;
3928   info->decl = decl;
3929   info->base = base;
3930   info->offset = off;
3931   info->offset_dt = vect_unknown_def_type;
3932   info->offset_vectype = NULL_TREE;
3933   info->scale = scale;
3934   info->element_type = element_type;
3935   info->memory_type = memory_type;
3936   return true;
3937 }
3938
3939 /* Function vect_analyze_data_refs.
3940
3941   Find all the data references in the loop or basic block.
3942
3943    The general structure of the analysis of data refs in the vectorizer is as
3944    follows:
3945    1- vect_analyze_data_refs(loop/bb): call
3946       compute_data_dependences_for_loop/bb to find and analyze all data-refs
3947       in the loop/bb and their dependences.
3948    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3949    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3950    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3951
3952 */
3953
3954 bool
3955 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3956 {
3957   struct loop *loop = NULL;
3958   unsigned int i;
3959   struct data_reference *dr;
3960   tree scalar_type;
3961
3962   if (dump_enabled_p ())
3963     dump_printf_loc (MSG_NOTE, vect_location,
3964                      "=== vect_analyze_data_refs ===\n");
3965
3966   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3967     loop = LOOP_VINFO_LOOP (loop_vinfo);
3968
3969   /* Go through the data-refs, check that the analysis succeeded.  Update
3970      pointer from stmt_vec_info struct to DR and vectype.  */
3971
3972   vec<data_reference_p> datarefs = vinfo->datarefs;
3973   FOR_EACH_VEC_ELT (datarefs, i, dr)
3974     {
3975       gimple *stmt;
3976       stmt_vec_info stmt_info;
3977       tree base, offset, init;
3978       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3979       bool simd_lane_access = false;
3980       poly_uint64 vf;
3981
3982 again:
3983       if (!dr || !DR_REF (dr))
3984         {
3985           if (dump_enabled_p ())
3986             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3987                              "not vectorized: unhandled data-ref\n");
3988           return false;
3989         }
3990
3991       stmt = DR_STMT (dr);
3992       stmt_info = vinfo_for_stmt (stmt);
3993
3994       /* Discard clobbers from the dataref vector.  We will remove
3995          clobber stmts during vectorization.  */
3996       if (gimple_clobber_p (stmt))
3997         {
3998           free_data_ref (dr);
3999           if (i == datarefs.length () - 1)
4000             {
4001               datarefs.pop ();
4002               break;
4003             }
4004           datarefs.ordered_remove (i);
4005           dr = datarefs[i];
4006           goto again;
4007         }
4008
4009       /* Check that analysis of the data-ref succeeded.  */
4010       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4011           || !DR_STEP (dr))
4012         {
4013           bool maybe_gather
4014             = DR_IS_READ (dr)
4015               && !TREE_THIS_VOLATILE (DR_REF (dr))
4016               && (targetm.vectorize.builtin_gather != NULL
4017                   || supports_vec_gather_load_p ());
4018           bool maybe_scatter
4019             = DR_IS_WRITE (dr)
4020               && !TREE_THIS_VOLATILE (DR_REF (dr))
4021               && (targetm.vectorize.builtin_scatter != NULL
4022                   || supports_vec_scatter_store_p ());
4023           bool maybe_simd_lane_access
4024             = is_a <loop_vec_info> (vinfo) && loop->simduid;
4025
4026           /* If target supports vector gather loads or scatter stores, or if
4027              this might be a SIMD lane access, see if they can't be used.  */
4028           if (is_a <loop_vec_info> (vinfo)
4029               && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
4030               && !nested_in_vect_loop_p (loop, stmt))
4031             {
4032               struct data_reference *newdr
4033                 = create_data_ref (NULL, loop_containing_stmt (stmt),
4034                                    DR_REF (dr), stmt, !maybe_scatter,
4035                                    DR_IS_CONDITIONAL_IN_STMT (dr));
4036               gcc_assert (newdr != NULL && DR_REF (newdr));
4037               if (DR_BASE_ADDRESS (newdr)
4038                   && DR_OFFSET (newdr)
4039                   && DR_INIT (newdr)
4040                   && DR_STEP (newdr)
4041                   && integer_zerop (DR_STEP (newdr)))
4042                 {
4043                   if (maybe_simd_lane_access)
4044                     {
4045                       tree off = DR_OFFSET (newdr);
4046                       STRIP_NOPS (off);
4047                       if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4048                           && TREE_CODE (off) == MULT_EXPR
4049                           && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4050                         {
4051                           tree step = TREE_OPERAND (off, 1);
4052                           off = TREE_OPERAND (off, 0);
4053                           STRIP_NOPS (off);
4054                           if (CONVERT_EXPR_P (off)
4055                               && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4056                                                                           0)))
4057                                  < TYPE_PRECISION (TREE_TYPE (off)))
4058                             off = TREE_OPERAND (off, 0);
4059                           if (TREE_CODE (off) == SSA_NAME)
4060                             {
4061                               gimple *def = SSA_NAME_DEF_STMT (off);
4062                               tree reft = TREE_TYPE (DR_REF (newdr));
4063                               if (is_gimple_call (def)
4064                                   && gimple_call_internal_p (def)
4065                                   && (gimple_call_internal_fn (def)
4066                                       == IFN_GOMP_SIMD_LANE))
4067                                 {
4068                                   tree arg = gimple_call_arg (def, 0);
4069                                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
4070                                   arg = SSA_NAME_VAR (arg);
4071                                   if (arg == loop->simduid
4072                                       /* For now.  */
4073                                       && tree_int_cst_equal
4074                                            (TYPE_SIZE_UNIT (reft),
4075                                             step))
4076                                     {
4077                                       DR_OFFSET (newdr) = ssize_int (0);
4078                                       DR_STEP (newdr) = step;
4079                                       DR_OFFSET_ALIGNMENT (newdr)
4080                                         = BIGGEST_ALIGNMENT;
4081                                       DR_STEP_ALIGNMENT (newdr)
4082                                         = highest_pow2_factor (step);
4083                                       dr = newdr;
4084                                       simd_lane_access = true;
4085                                     }
4086                                 }
4087                             }
4088                         }
4089                     }
4090                   if (!simd_lane_access && (maybe_gather || maybe_scatter))
4091                     {
4092                       dr = newdr;
4093                       if (maybe_gather)
4094                         gatherscatter = GATHER;
4095                       else
4096                         gatherscatter = SCATTER;
4097                     }
4098                 }
4099               if (gatherscatter == SG_NONE && !simd_lane_access)
4100                 free_data_ref (newdr);
4101             }
4102
4103           if (gatherscatter == SG_NONE && !simd_lane_access)
4104             {
4105               if (dump_enabled_p ())
4106                 {
4107                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4108                                    "not vectorized: data ref analysis "
4109                                    "failed ");
4110                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4111                 }
4112
4113               if (is_a <bb_vec_info> (vinfo))
4114                 break;
4115
4116               return false;
4117             }
4118         }
4119
4120       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4121         {
4122           if (dump_enabled_p ())
4123             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4124                              "not vectorized: base addr of dr is a "
4125                              "constant\n");
4126
4127           if (is_a <bb_vec_info> (vinfo))
4128             break;
4129
4130           if (gatherscatter != SG_NONE || simd_lane_access)
4131             free_data_ref (dr);
4132           return false;
4133         }
4134
4135       if (TREE_THIS_VOLATILE (DR_REF (dr)))
4136         {
4137           if (dump_enabled_p ())
4138             {
4139               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4140                                "not vectorized: volatile type ");
4141               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4142             }
4143
4144           if (is_a <bb_vec_info> (vinfo))
4145             break;
4146
4147           return false;
4148         }
4149
4150       if (stmt_can_throw_internal (stmt))
4151         {
4152           if (dump_enabled_p ())
4153             {
4154               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4155                                "not vectorized: statement can throw an "
4156                                "exception ");
4157               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4158             }
4159
4160           if (is_a <bb_vec_info> (vinfo))
4161             break;
4162
4163           if (gatherscatter != SG_NONE || simd_lane_access)
4164             free_data_ref (dr);
4165           return false;
4166         }
4167
4168       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4169           && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4170         {
4171           if (dump_enabled_p ())
4172             {
4173               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4174                                "not vectorized: statement is bitfield "
4175                                "access ");
4176               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4177             }
4178
4179           if (is_a <bb_vec_info> (vinfo))
4180             break;
4181
4182           if (gatherscatter != SG_NONE || simd_lane_access)
4183             free_data_ref (dr);
4184           return false;
4185         }
4186
4187       base = unshare_expr (DR_BASE_ADDRESS (dr));
4188       offset = unshare_expr (DR_OFFSET (dr));
4189       init = unshare_expr (DR_INIT (dr));
4190
4191       if (is_gimple_call (stmt)
4192           && (!gimple_call_internal_p (stmt)
4193               || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
4194                   && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
4195         {
4196           if (dump_enabled_p ())
4197             {
4198               dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
4199                                "not vectorized: dr in a call ");
4200               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4201             }
4202
4203           if (is_a <bb_vec_info> (vinfo))
4204             break;
4205
4206           if (gatherscatter != SG_NONE || simd_lane_access)
4207             free_data_ref (dr);
4208           return false;
4209         }
4210
4211       /* Update DR field in stmt_vec_info struct.  */
4212
4213       /* If the dataref is in an inner-loop of the loop that is considered for
4214          for vectorization, we also want to analyze the access relative to
4215          the outer-loop (DR contains information only relative to the
4216          inner-most enclosing loop).  We do that by building a reference to the
4217          first location accessed by the inner-loop, and analyze it relative to
4218          the outer-loop.  */
4219       if (loop && nested_in_vect_loop_p (loop, stmt))
4220         {
4221           /* Build a reference to the first location accessed by the
4222              inner loop: *(BASE + INIT + OFFSET).  By construction,
4223              this address must be invariant in the inner loop, so we
4224              can consider it as being used in the outer loop.  */
4225           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4226                                           init, offset);
4227           tree init_addr = fold_build_pointer_plus (base, init_offset);
4228           tree init_ref = build_fold_indirect_ref (init_addr);
4229
4230           if (dump_enabled_p ())
4231             {
4232               dump_printf_loc (MSG_NOTE, vect_location,
4233                                "analyze in outer loop: ");
4234               dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4235               dump_printf (MSG_NOTE, "\n");
4236             }
4237
4238           if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4239                                      init_ref, loop))
4240             /* dr_analyze_innermost already explained the failure.  */
4241             return false;
4242
4243           if (dump_enabled_p ())
4244             {
4245               dump_printf_loc (MSG_NOTE, vect_location,
4246                                "\touter base_address: ");
4247               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4248                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4249               dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4250               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4251                                  STMT_VINFO_DR_OFFSET (stmt_info));
4252               dump_printf (MSG_NOTE,
4253                            "\n\touter constant offset from base address: ");
4254               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4255                                  STMT_VINFO_DR_INIT (stmt_info));
4256               dump_printf (MSG_NOTE, "\n\touter step: ");
4257               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4258                                  STMT_VINFO_DR_STEP (stmt_info));
4259               dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4260                            STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4261               dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4262                            STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4263               dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4264                            STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4265               dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4266                            STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4267             }
4268         }
4269
4270       if (STMT_VINFO_DATA_REF (stmt_info))
4271         {
4272           if (dump_enabled_p ())
4273             {
4274               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4275                                "not vectorized: more than one data ref "
4276                                "in stmt: ");
4277               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4278             }
4279
4280           if (is_a <bb_vec_info> (vinfo))
4281             break;
4282
4283           if (gatherscatter != SG_NONE || simd_lane_access)
4284             free_data_ref (dr);
4285           return false;
4286         }
4287
4288       STMT_VINFO_DATA_REF (stmt_info) = dr;
4289       if (simd_lane_access)
4290         {
4291           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4292           free_data_ref (datarefs[i]);
4293           datarefs[i] = dr;
4294         }
4295
4296       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4297           && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4298           && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4299         {
4300           if (dump_enabled_p ())
4301             {
4302               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4303                                "not vectorized: base object not addressable "
4304                                "for stmt: ");
4305               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4306             }
4307           if (is_a <bb_vec_info> (vinfo))
4308             {
4309               /* In BB vectorization the ref can still participate
4310                  in dependence analysis, we just can't vectorize it.  */
4311               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4312               continue;
4313             }
4314           return false;
4315         }
4316
4317       /* Set vectype for STMT.  */
4318       scalar_type = TREE_TYPE (DR_REF (dr));
4319       STMT_VINFO_VECTYPE (stmt_info)
4320         = get_vectype_for_scalar_type (scalar_type);
4321       if (!STMT_VINFO_VECTYPE (stmt_info))
4322         {
4323           if (dump_enabled_p ())
4324             {
4325               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4326                                "not vectorized: no vectype for stmt: ");
4327               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4328               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4329               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4330                                  scalar_type);
4331               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4332             }
4333
4334           if (is_a <bb_vec_info> (vinfo))
4335             {
4336               /* No vector type is fine, the ref can still participate
4337                  in dependence analysis, we just can't vectorize it.  */
4338               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4339               continue;
4340             }
4341
4342           if (gatherscatter != SG_NONE || simd_lane_access)
4343             {
4344               STMT_VINFO_DATA_REF (stmt_info) = NULL;
4345               if (gatherscatter != SG_NONE)
4346                 free_data_ref (dr);
4347             }
4348           return false;
4349         }
4350       else
4351         {
4352           if (dump_enabled_p ())
4353             {
4354               dump_printf_loc (MSG_NOTE, vect_location,
4355                                "got vectype for stmt: ");
4356               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4357               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4358                                  STMT_VINFO_VECTYPE (stmt_info));
4359               dump_printf (MSG_NOTE, "\n");
4360             }
4361         }
4362
4363       /* Adjust the minimal vectorization factor according to the
4364          vector type.  */
4365       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4366       *min_vf = upper_bound (*min_vf, vf);
4367
4368       if (gatherscatter != SG_NONE)
4369         {
4370           gather_scatter_info gs_info;
4371           if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4372                                           &gs_info)
4373               || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4374             {
4375               STMT_VINFO_DATA_REF (stmt_info) = NULL;
4376               free_data_ref (dr);
4377               if (dump_enabled_p ())
4378                 {
4379                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4380                                    (gatherscatter == GATHER) ?
4381                                    "not vectorized: not suitable for gather "
4382                                    "load " :
4383                                    "not vectorized: not suitable for scatter "
4384                                    "store ");
4385                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4386                 }
4387               return false;
4388             }
4389
4390           free_data_ref (datarefs[i]);
4391           datarefs[i] = dr;
4392           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4393         }
4394
4395       else if (is_a <loop_vec_info> (vinfo)
4396                && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4397         {
4398           if (nested_in_vect_loop_p (loop, stmt))
4399             {
4400               if (dump_enabled_p ())
4401                 {
4402                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4403                                    "not vectorized: not suitable for strided "
4404                                    "load ");
4405                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4406                 }
4407               return false;
4408             }
4409           STMT_VINFO_STRIDED_P (stmt_info) = true;
4410         }
4411     }
4412
4413   /* If we stopped analysis at the first dataref we could not analyze
4414      when trying to vectorize a basic-block mark the rest of the datarefs
4415      as not vectorizable and truncate the vector of datarefs.  That
4416      avoids spending useless time in analyzing their dependence.  */
4417   if (i != datarefs.length ())
4418     {
4419       gcc_assert (is_a <bb_vec_info> (vinfo));
4420       for (unsigned j = i; j < datarefs.length (); ++j)
4421         {
4422           data_reference_p dr = datarefs[j];
4423           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4424           free_data_ref (dr);
4425         }
4426       datarefs.truncate (i);
4427     }
4428
4429   return true;
4430 }
4431
4432
4433 /* Function vect_get_new_vect_var.
4434
4435    Returns a name for a new variable.  The current naming scheme appends the
4436    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4437    the name of vectorizer generated variables, and appends that to NAME if
4438    provided.  */
4439
4440 tree
4441 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4442 {
4443   const char *prefix;
4444   tree new_vect_var;
4445
4446   switch (var_kind)
4447   {
4448   case vect_simple_var:
4449     prefix = "vect";
4450     break;
4451   case vect_scalar_var:
4452     prefix = "stmp";
4453     break;
4454   case vect_mask_var:
4455     prefix = "mask";
4456     break;
4457   case vect_pointer_var:
4458     prefix = "vectp";
4459     break;
4460   default:
4461     gcc_unreachable ();
4462   }
4463
4464   if (name)
4465     {
4466       char* tmp = concat (prefix, "_", name, NULL);
4467       new_vect_var = create_tmp_reg (type, tmp);
4468       free (tmp);
4469     }
4470   else
4471     new_vect_var = create_tmp_reg (type, prefix);
4472
4473   return new_vect_var;
4474 }
4475
4476 /* Like vect_get_new_vect_var but return an SSA name.  */
4477
4478 tree
4479 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4480 {
4481   const char *prefix;
4482   tree new_vect_var;
4483
4484   switch (var_kind)
4485   {
4486   case vect_simple_var:
4487     prefix = "vect";
4488     break;
4489   case vect_scalar_var:
4490     prefix = "stmp";
4491     break;
4492   case vect_pointer_var:
4493     prefix = "vectp";
4494     break;
4495   default:
4496     gcc_unreachable ();
4497   }
4498
4499   if (name)
4500     {
4501       char* tmp = concat (prefix, "_", name, NULL);
4502       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4503       free (tmp);
4504     }
4505   else
4506     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4507
4508   return new_vect_var;
4509 }
4510
4511 /* Duplicate ptr info and set alignment/misaligment on NAME from DR.  */
4512
4513 static void
4514 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4515 {
4516   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4517   int misalign = DR_MISALIGNMENT (dr);
4518   if (misalign == DR_MISALIGNMENT_UNKNOWN)
4519     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4520   else
4521     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4522                             DR_TARGET_ALIGNMENT (dr), misalign);
4523 }
4524
4525 /* Function vect_create_addr_base_for_vector_ref.
4526
4527    Create an expression that computes the address of the first memory location
4528    that will be accessed for a data reference.
4529
4530    Input:
4531    STMT: The statement containing the data reference.
4532    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4533    OFFSET: Optional. If supplied, it is be added to the initial address.
4534    LOOP:    Specify relative to which loop-nest should the address be computed.
4535             For example, when the dataref is in an inner-loop nested in an
4536             outer-loop that is now being vectorized, LOOP can be either the
4537             outer-loop, or the inner-loop.  The first memory location accessed
4538             by the following dataref ('in' points to short):
4539
4540                 for (i=0; i<N; i++)
4541                    for (j=0; j<M; j++)
4542                      s += in[i+j]
4543
4544             is as follows:
4545             if LOOP=i_loop:     &in             (relative to i_loop)
4546             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
4547    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4548             initial address.  Unlike OFFSET, which is number of elements to
4549             be added, BYTE_OFFSET is measured in bytes.
4550
4551    Output:
4552    1. Return an SSA_NAME whose value is the address of the memory location of
4553       the first vector of the data reference.
4554    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4555       these statement(s) which define the returned SSA_NAME.
4556
4557    FORNOW: We are only handling array accesses with step 1.  */
4558
4559 tree
4560 vect_create_addr_base_for_vector_ref (gimple *stmt,
4561                                       gimple_seq *new_stmt_list,
4562                                       tree offset,
4563                                       tree byte_offset)
4564 {
4565   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4566   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4567   const char *base_name;
4568   tree addr_base;
4569   tree dest;
4570   gimple_seq seq = NULL;
4571   tree vect_ptr_type;
4572   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4573   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4574   innermost_loop_behavior *drb = vect_dr_behavior (dr);
4575
4576   tree data_ref_base = unshare_expr (drb->base_address);
4577   tree base_offset = unshare_expr (drb->offset);
4578   tree init = unshare_expr (drb->init);
4579
4580   if (loop_vinfo)
4581     base_name = get_name (data_ref_base);
4582   else
4583     {
4584       base_offset = ssize_int (0);
4585       init = ssize_int (0);
4586       base_name = get_name (DR_REF (dr));
4587     }
4588
4589   /* Create base_offset */
4590   base_offset = size_binop (PLUS_EXPR,
4591                             fold_convert (sizetype, base_offset),
4592                             fold_convert (sizetype, init));
4593
4594   if (offset)
4595     {
4596       offset = fold_build2 (MULT_EXPR, sizetype,
4597                             fold_convert (sizetype, offset), step);
4598       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4599                                  base_offset, offset);
4600     }
4601   if (byte_offset)
4602     {
4603       byte_offset = fold_convert (sizetype, byte_offset);
4604       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4605                                  base_offset, byte_offset);
4606     }
4607
4608   /* base + base_offset */
4609   if (loop_vinfo)
4610     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4611   else
4612     {
4613       addr_base = build1 (ADDR_EXPR,
4614                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
4615                           unshare_expr (DR_REF (dr)));
4616     }
4617
4618   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4619   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4620   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4621   gimple_seq_add_seq (new_stmt_list, seq);
4622
4623   if (DR_PTR_INFO (dr)
4624       && TREE_CODE (addr_base) == SSA_NAME
4625       && !SSA_NAME_PTR_INFO (addr_base))
4626     {
4627       vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4628       if (offset || byte_offset)
4629         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4630     }
4631
4632   if (dump_enabled_p ())
4633     {
4634       dump_printf_loc (MSG_NOTE, vect_location, "created ");
4635       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4636       dump_printf (MSG_NOTE, "\n");
4637     }
4638
4639   return addr_base;
4640 }
4641
4642
4643 /* Function vect_create_data_ref_ptr.
4644
4645    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4646    location accessed in the loop by STMT, along with the def-use update
4647    chain to appropriately advance the pointer through the loop iterations.
4648    Also set aliasing information for the pointer.  This pointer is used by
4649    the callers to this function to create a memory reference expression for
4650    vector load/store access.
4651
4652    Input:
4653    1. STMT: a stmt that references memory. Expected to be of the form
4654          GIMPLE_ASSIGN <name, data-ref> or
4655          GIMPLE_ASSIGN <data-ref, name>.
4656    2. AGGR_TYPE: the type of the reference, which should be either a vector
4657         or an array.
4658    3. AT_LOOP: the loop where the vector memref is to be created.
4659    4. OFFSET (optional): an offset to be added to the initial address accessed
4660         by the data-ref in STMT.
4661    5. BSI: location where the new stmts are to be placed if there is no loop
4662    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4663         pointing to the initial address.
4664    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4665         to the initial address accessed by the data-ref in STMT.  This is
4666         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4667         in bytes.
4668    8. IV_STEP (optional, defaults to NULL): the amount that should be added
4669         to the IV during each iteration of the loop.  NULL says to move
4670         by one copy of AGGR_TYPE up or down, depending on the step of the
4671         data reference.
4672
4673    Output:
4674    1. Declare a new ptr to vector_type, and have it point to the base of the
4675       data reference (initial addressed accessed by the data reference).
4676       For example, for vector of type V8HI, the following code is generated:
4677
4678       v8hi *ap;
4679       ap = (v8hi *)initial_address;
4680
4681       if OFFSET is not supplied:
4682          initial_address = &a[init];
4683       if OFFSET is supplied:
4684          initial_address = &a[init + OFFSET];
4685       if BYTE_OFFSET is supplied:
4686          initial_address = &a[init] + BYTE_OFFSET;
4687
4688       Return the initial_address in INITIAL_ADDRESS.
4689
4690    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4691       update the pointer in each iteration of the loop.
4692
4693       Return the increment stmt that updates the pointer in PTR_INCR.
4694
4695    3. Set INV_P to true if the access pattern of the data reference in the
4696       vectorized loop is invariant.  Set it to false otherwise.
4697
4698    4. Return the pointer.  */
4699
4700 tree
4701 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4702                           tree offset, tree *initial_address,
4703                           gimple_stmt_iterator *gsi, gimple **ptr_incr,
4704                           bool only_init, bool *inv_p, tree byte_offset,
4705                           tree iv_step)
4706 {
4707   const char *base_name;
4708   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4709   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4710   struct loop *loop = NULL;
4711   bool nested_in_vect_loop = false;
4712   struct loop *containing_loop = NULL;
4713   tree aggr_ptr_type;
4714   tree aggr_ptr;
4715   tree new_temp;
4716   gimple_seq new_stmt_list = NULL;
4717   edge pe = NULL;
4718   basic_block new_bb;
4719   tree aggr_ptr_init;
4720   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4721   tree aptr;
4722   gimple_stmt_iterator incr_gsi;
4723   bool insert_after;
4724   tree indx_before_incr, indx_after_incr;
4725   gimple *incr;
4726   tree step;
4727   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4728
4729   gcc_assert (iv_step != NULL_TREE
4730               || TREE_CODE (aggr_type) == ARRAY_TYPE
4731               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4732
4733   if (loop_vinfo)
4734     {
4735       loop = LOOP_VINFO_LOOP (loop_vinfo);
4736       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4737       containing_loop = (gimple_bb (stmt))->loop_father;
4738       pe = loop_preheader_edge (loop);
4739     }
4740   else
4741     {
4742       gcc_assert (bb_vinfo);
4743       only_init = true;
4744       *ptr_incr = NULL;
4745     }
4746
4747   /* Check the step (evolution) of the load in LOOP, and record
4748      whether it's invariant.  */
4749   step = vect_dr_behavior (dr)->step;
4750   if (integer_zerop (step))
4751     *inv_p = true;
4752   else
4753     *inv_p = false;
4754
4755   /* Create an expression for the first address accessed by this load
4756      in LOOP.  */
4757   base_name = get_name (DR_BASE_ADDRESS (dr));
4758
4759   if (dump_enabled_p ())
4760     {
4761       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4762       dump_printf_loc (MSG_NOTE, vect_location,
4763                        "create %s-pointer variable to type: ",
4764                        get_tree_code_name (TREE_CODE (aggr_type)));
4765       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4766       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4767         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4768       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4769         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4770       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4771         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4772       else
4773         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4774       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4775       dump_printf (MSG_NOTE, "\n");
4776     }
4777
4778   /* (1) Create the new aggregate-pointer variable.
4779      Vector and array types inherit the alias set of their component
4780      type by default so we need to use a ref-all pointer if the data
4781      reference does not conflict with the created aggregated data
4782      reference because it is not addressable.  */
4783   bool need_ref_all = false;
4784   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4785                               get_alias_set (DR_REF (dr))))
4786     need_ref_all = true;
4787   /* Likewise for any of the data references in the stmt group.  */
4788   else if (DR_GROUP_SIZE (stmt_info) > 1)
4789     {
4790       gimple *orig_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info);
4791       do
4792         {
4793           stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4794           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4795           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4796                                       get_alias_set (DR_REF (sdr))))
4797             {
4798               need_ref_all = true;
4799               break;
4800             }
4801           orig_stmt = DR_GROUP_NEXT_ELEMENT (sinfo);
4802         }
4803       while (orig_stmt);
4804     }
4805   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4806                                                need_ref_all);
4807   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4808
4809
4810   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4811      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4812      def-use update cycles for the pointer: one relative to the outer-loop
4813      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4814      to the inner-loop (which is the inner-most loop containing the dataref),
4815      and this is done be step (5) below.
4816
4817      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4818      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4819      redundant.  Steps (3),(4) create the following:
4820
4821         vp0 = &base_addr;
4822         LOOP:   vp1 = phi(vp0,vp2)
4823                 ...
4824                 ...
4825                 vp2 = vp1 + step
4826                 goto LOOP
4827
4828      If there is an inner-loop nested in loop, then step (5) will also be
4829      applied, and an additional update in the inner-loop will be created:
4830
4831         vp0 = &base_addr;
4832         LOOP:   vp1 = phi(vp0,vp2)
4833                 ...
4834         inner:     vp3 = phi(vp1,vp4)
4835                    vp4 = vp3 + inner_step
4836                    if () goto inner
4837                 ...
4838                 vp2 = vp1 + step
4839                 if () goto LOOP   */
4840
4841   /* (2) Calculate the initial address of the aggregate-pointer, and set
4842      the aggregate-pointer to point to it before the loop.  */
4843
4844   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4845
4846   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4847                                                    offset, byte_offset);
4848   if (new_stmt_list)
4849     {
4850       if (pe)
4851         {
4852           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4853           gcc_assert (!new_bb);
4854         }
4855       else
4856         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4857     }
4858
4859   *initial_address = new_temp;
4860   aggr_ptr_init = new_temp;
4861
4862   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4863      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4864      inner-loop nested in LOOP (during outer-loop vectorization).  */
4865
4866   /* No update in loop is required.  */
4867   if (only_init && (!loop_vinfo || at_loop == loop))
4868     aptr = aggr_ptr_init;
4869   else
4870     {
4871       if (iv_step == NULL_TREE)
4872         {
4873           /* The step of the aggregate pointer is the type size.  */
4874           iv_step = TYPE_SIZE_UNIT (aggr_type);
4875           /* One exception to the above is when the scalar step of the load in
4876              LOOP is zero. In this case the step here is also zero.  */
4877           if (*inv_p)
4878             iv_step = size_zero_node;
4879           else if (tree_int_cst_sgn (step) == -1)
4880             iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4881         }
4882
4883       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4884
4885       create_iv (aggr_ptr_init,
4886                  fold_convert (aggr_ptr_type, iv_step),
4887                  aggr_ptr, loop, &incr_gsi, insert_after,
4888                  &indx_before_incr, &indx_after_incr);
4889       incr = gsi_stmt (incr_gsi);
4890       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4891
4892       /* Copy the points-to information if it exists. */
4893       if (DR_PTR_INFO (dr))
4894         {
4895           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4896           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4897         }
4898       if (ptr_incr)
4899         *ptr_incr = incr;
4900
4901       aptr = indx_before_incr;
4902     }
4903
4904   if (!nested_in_vect_loop || only_init)
4905     return aptr;
4906
4907
4908   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4909      nested in LOOP, if exists.  */
4910
4911   gcc_assert (nested_in_vect_loop);
4912   if (!only_init)
4913     {
4914       standard_iv_increment_position (containing_loop, &incr_gsi,
4915                                       &insert_after);
4916       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4917                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4918                  &indx_after_incr);
4919       incr = gsi_stmt (incr_gsi);
4920       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4921
4922       /* Copy the points-to information if it exists. */
4923       if (DR_PTR_INFO (dr))
4924         {
4925           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4926           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4927         }
4928       if (ptr_incr)
4929         *ptr_incr = incr;
4930
4931       return indx_before_incr;
4932     }
4933   else
4934     gcc_unreachable ();
4935 }
4936
4937
4938 /* Function bump_vector_ptr
4939
4940    Increment a pointer (to a vector type) by vector-size. If requested,
4941    i.e. if PTR-INCR is given, then also connect the new increment stmt
4942    to the existing def-use update-chain of the pointer, by modifying
4943    the PTR_INCR as illustrated below:
4944
4945    The pointer def-use update-chain before this function:
4946                         DATAREF_PTR = phi (p_0, p_2)
4947                         ....
4948         PTR_INCR:       p_2 = DATAREF_PTR + step
4949
4950    The pointer def-use update-chain after this function:
4951                         DATAREF_PTR = phi (p_0, p_2)
4952                         ....
4953                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4954                         ....
4955         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4956
4957    Input:
4958    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4959                  in the loop.
4960    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4961               the loop.  The increment amount across iterations is expected
4962               to be vector_size.
4963    BSI - location where the new update stmt is to be placed.
4964    STMT - the original scalar memory-access stmt that is being vectorized.
4965    BUMP - optional. The offset by which to bump the pointer. If not given,
4966           the offset is assumed to be vector_size.
4967
4968    Output: Return NEW_DATAREF_PTR as illustrated above.
4969
4970 */
4971
4972 tree
4973 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4974                  gimple *stmt, tree bump)
4975 {
4976   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4977   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4978   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4979   tree update = TYPE_SIZE_UNIT (vectype);
4980   gassign *incr_stmt;
4981   ssa_op_iter iter;
4982   use_operand_p use_p;
4983   tree new_dataref_ptr;
4984
4985   if (bump)
4986     update = bump;
4987
4988   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4989     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4990   else
4991     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4992   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4993                                    dataref_ptr, update);
4994   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4995
4996   /* Copy the points-to information if it exists. */
4997   if (DR_PTR_INFO (dr))
4998     {
4999       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5000       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5001     }
5002
5003   if (!ptr_incr)
5004     return new_dataref_ptr;
5005
5006   /* Update the vector-pointer's cross-iteration increment.  */
5007   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5008     {
5009       tree use = USE_FROM_PTR (use_p);
5010
5011       if (use == dataref_ptr)
5012         SET_USE (use_p, new_dataref_ptr);
5013       else
5014         gcc_assert (operand_equal_p (use, update, 0));
5015     }
5016
5017   return new_dataref_ptr;
5018 }
5019
5020
5021 /* Copy memory reference info such as base/clique from the SRC reference
5022    to the DEST MEM_REF.  */
5023
5024 void
5025 vect_copy_ref_info (tree dest, tree src)
5026 {
5027   if (TREE_CODE (dest) != MEM_REF)
5028     return;
5029
5030   tree src_base = src;
5031   while (handled_component_p (src_base))
5032     src_base = TREE_OPERAND (src_base, 0);
5033   if (TREE_CODE (src_base) != MEM_REF
5034       && TREE_CODE (src_base) != TARGET_MEM_REF)
5035     return;
5036
5037   MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5038   MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5039 }
5040
5041
5042 /* Function vect_create_destination_var.
5043
5044    Create a new temporary of type VECTYPE.  */
5045
5046 tree
5047 vect_create_destination_var (tree scalar_dest, tree vectype)
5048 {
5049   tree vec_dest;
5050   const char *name;
5051   char *new_name;
5052   tree type;
5053   enum vect_var_kind kind;
5054
5055   kind = vectype
5056     ? VECTOR_BOOLEAN_TYPE_P (vectype)
5057     ? vect_mask_var
5058     : vect_simple_var
5059     : vect_scalar_var;
5060   type = vectype ? vectype : TREE_TYPE (scalar_dest);
5061
5062   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5063
5064   name = get_name (scalar_dest);
5065   if (name)
5066     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5067   else
5068     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5069   vec_dest = vect_get_new_vect_var (type, kind, new_name);
5070   free (new_name);
5071
5072   return vec_dest;
5073 }
5074
5075 /* Function vect_grouped_store_supported.
5076
5077    Returns TRUE if interleave high and interleave low permutations
5078    are supported, and FALSE otherwise.  */
5079
5080 bool
5081 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5082 {
5083   machine_mode mode = TYPE_MODE (vectype);
5084
5085   /* vect_permute_store_chain requires the group size to be equal to 3 or
5086      be a power of two.  */
5087   if (count != 3 && exact_log2 (count) == -1)
5088     {
5089       if (dump_enabled_p ())
5090         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5091                          "the size of the group of accesses"
5092                          " is not a power of 2 or not eqaul to 3\n");
5093       return false;
5094     }
5095
5096   /* Check that the permutation is supported.  */
5097   if (VECTOR_MODE_P (mode))
5098     {
5099       unsigned int i;
5100       if (count == 3)
5101         {
5102           unsigned int j0 = 0, j1 = 0, j2 = 0;
5103           unsigned int i, j;
5104
5105           unsigned int nelt;
5106           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5107             {
5108               if (dump_enabled_p ())
5109                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5110                                  "cannot handle groups of 3 stores for"
5111                                  " variable-length vectors\n");
5112               return false;
5113             }
5114
5115           vec_perm_builder sel (nelt, nelt, 1);
5116           sel.quick_grow (nelt);
5117           vec_perm_indices indices;
5118           for (j = 0; j < 3; j++)
5119             {
5120               int nelt0 = ((3 - j) * nelt) % 3;
5121               int nelt1 = ((3 - j) * nelt + 1) % 3;
5122               int nelt2 = ((3 - j) * nelt + 2) % 3;
5123               for (i = 0; i < nelt; i++)
5124                 {
5125                   if (3 * i + nelt0 < nelt)
5126                     sel[3 * i + nelt0] = j0++;
5127                   if (3 * i + nelt1 < nelt)
5128                     sel[3 * i + nelt1] = nelt + j1++;
5129                   if (3 * i + nelt2 < nelt)
5130                     sel[3 * i + nelt2] = 0;
5131                 }
5132               indices.new_vector (sel, 2, nelt);
5133               if (!can_vec_perm_const_p (mode, indices))
5134                 {
5135                   if (dump_enabled_p ())
5136                     dump_printf (MSG_MISSED_OPTIMIZATION,
5137                                  "permutation op not supported by target.\n");
5138                   return false;
5139                 }
5140
5141               for (i = 0; i < nelt; i++)
5142                 {
5143                   if (3 * i + nelt0 < nelt)
5144                     sel[3 * i + nelt0] = 3 * i + nelt0;
5145                   if (3 * i + nelt1 < nelt)
5146                     sel[3 * i + nelt1] = 3 * i + nelt1;
5147                   if (3 * i + nelt2 < nelt)
5148                     sel[3 * i + nelt2] = nelt + j2++;
5149                 }
5150               indices.new_vector (sel, 2, nelt);
5151               if (!can_vec_perm_const_p (mode, indices))
5152                 {
5153                   if (dump_enabled_p ())
5154                     dump_printf (MSG_MISSED_OPTIMIZATION,
5155                                  "permutation op not supported by target.\n");
5156                   return false;
5157                 }
5158             }
5159           return true;
5160         }
5161       else
5162         {
5163           /* If length is not equal to 3 then only power of 2 is supported.  */
5164           gcc_assert (pow2p_hwi (count));
5165           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5166
5167           /* The encoding has 2 interleaved stepped patterns.  */
5168           vec_perm_builder sel (nelt, 2, 3);
5169           sel.quick_grow (6);
5170           for (i = 0; i < 3; i++)
5171             {
5172               sel[i * 2] = i;
5173               sel[i * 2 + 1] = i + nelt;
5174             }
5175           vec_perm_indices indices (sel, 2, nelt);
5176           if (can_vec_perm_const_p (mode, indices))
5177             {
5178               for (i = 0; i < 6; i++)
5179                 sel[i] += exact_div (nelt, 2);
5180               indices.new_vector (sel, 2, nelt);
5181               if (can_vec_perm_const_p (mode, indices))
5182                 return true;
5183             }
5184         }
5185     }
5186
5187   if (dump_enabled_p ())
5188     dump_printf (MSG_MISSED_OPTIMIZATION,
5189                  "permutaion op not supported by target.\n");
5190   return false;
5191 }
5192
5193
5194 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5195    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5196
5197 bool
5198 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5199                             bool masked_p)
5200 {
5201   if (masked_p)
5202     return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5203                                          vec_mask_store_lanes_optab,
5204                                          vectype, count);
5205   else
5206     return vect_lanes_optab_supported_p ("vec_store_lanes",
5207                                          vec_store_lanes_optab,
5208                                          vectype, count);
5209 }
5210
5211
5212 /* Function vect_permute_store_chain.
5213
5214    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5215    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5216    the data correctly for the stores.  Return the final references for stores
5217    in RESULT_CHAIN.
5218
5219    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5220    The input is 4 vectors each containing 8 elements.  We assign a number to
5221    each element, the input sequence is:
5222
5223    1st vec:   0  1  2  3  4  5  6  7
5224    2nd vec:   8  9 10 11 12 13 14 15
5225    3rd vec:  16 17 18 19 20 21 22 23
5226    4th vec:  24 25 26 27 28 29 30 31
5227
5228    The output sequence should be:
5229
5230    1st vec:  0  8 16 24  1  9 17 25
5231    2nd vec:  2 10 18 26  3 11 19 27
5232    3rd vec:  4 12 20 28  5 13 21 30
5233    4th vec:  6 14 22 30  7 15 23 31
5234
5235    i.e., we interleave the contents of the four vectors in their order.
5236
5237    We use interleave_high/low instructions to create such output.  The input of
5238    each interleave_high/low operation is two vectors:
5239    1st vec    2nd vec
5240    0 1 2 3    4 5 6 7
5241    the even elements of the result vector are obtained left-to-right from the
5242    high/low elements of the first vector.  The odd elements of the result are
5243    obtained left-to-right from the high/low elements of the second vector.
5244    The output of interleave_high will be:   0 4 1 5
5245    and of interleave_low:                   2 6 3 7
5246
5247
5248    The permutation is done in log LENGTH stages.  In each stage interleave_high
5249    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5250    where the first argument is taken from the first half of DR_CHAIN and the
5251    second argument from it's second half.
5252    In our example,
5253
5254    I1: interleave_high (1st vec, 3rd vec)
5255    I2: interleave_low (1st vec, 3rd vec)
5256    I3: interleave_high (2nd vec, 4th vec)
5257    I4: interleave_low (2nd vec, 4th vec)
5258
5259    The output for the first stage is:
5260
5261    I1:  0 16  1 17  2 18  3 19
5262    I2:  4 20  5 21  6 22  7 23
5263    I3:  8 24  9 25 10 26 11 27
5264    I4: 12 28 13 29 14 30 15 31
5265
5266    The output of the second stage, i.e. the final result is:
5267
5268    I1:  0  8 16 24  1  9 17 25
5269    I2:  2 10 18 26  3 11 19 27
5270    I3:  4 12 20 28  5 13 21 30
5271    I4:  6 14 22 30  7 15 23 31.  */
5272
5273 void
5274 vect_permute_store_chain (vec<tree> dr_chain,
5275                           unsigned int length,
5276                           gimple *stmt,
5277                           gimple_stmt_iterator *gsi,
5278                           vec<tree> *result_chain)
5279 {
5280   tree vect1, vect2, high, low;
5281   gimple *perm_stmt;
5282   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5283   tree perm_mask_low, perm_mask_high;
5284   tree data_ref;
5285   tree perm3_mask_low, perm3_mask_high;
5286   unsigned int i, j, n, log_length = exact_log2 (length);
5287
5288   result_chain->quick_grow (length);
5289   memcpy (result_chain->address (), dr_chain.address (),
5290           length * sizeof (tree));
5291
5292   if (length == 3)
5293     {
5294       /* vect_grouped_store_supported ensures that this is constant.  */
5295       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5296       unsigned int j0 = 0, j1 = 0, j2 = 0;
5297
5298       vec_perm_builder sel (nelt, nelt, 1);
5299       sel.quick_grow (nelt);
5300       vec_perm_indices indices;
5301       for (j = 0; j < 3; j++)
5302         {
5303           int nelt0 = ((3 - j) * nelt) % 3;
5304           int nelt1 = ((3 - j) * nelt + 1) % 3;
5305           int nelt2 = ((3 - j) * nelt + 2) % 3;
5306
5307           for (i = 0; i < nelt; i++)
5308             {
5309               if (3 * i + nelt0 < nelt)
5310                 sel[3 * i + nelt0] = j0++;
5311               if (3 * i + nelt1 < nelt)
5312                 sel[3 * i + nelt1] = nelt + j1++;
5313               if (3 * i + nelt2 < nelt)
5314                 sel[3 * i + nelt2] = 0;
5315             }
5316           indices.new_vector (sel, 2, nelt);
5317           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5318
5319           for (i = 0; i < nelt; i++)
5320             {
5321               if (3 * i + nelt0 < nelt)
5322                 sel[3 * i + nelt0] = 3 * i + nelt0;
5323               if (3 * i + nelt1 < nelt)
5324                 sel[3 * i + nelt1] = 3 * i + nelt1;
5325               if (3 * i + nelt2 < nelt)
5326                 sel[3 * i + nelt2] = nelt + j2++;
5327             }
5328           indices.new_vector (sel, 2, nelt);
5329           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5330
5331           vect1 = dr_chain[0];
5332           vect2 = dr_chain[1];
5333
5334           /* Create interleaving stmt:
5335              low = VEC_PERM_EXPR <vect1, vect2,
5336                                   {j, nelt, *, j + 1, nelt + j + 1, *,
5337                                    j + 2, nelt + j + 2, *, ...}>  */
5338           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5339           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5340                                            vect2, perm3_mask_low);
5341           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5342
5343           vect1 = data_ref;
5344           vect2 = dr_chain[2];
5345           /* Create interleaving stmt:
5346              low = VEC_PERM_EXPR <vect1, vect2,
5347                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
5348                                    6, 7, nelt + j + 2, ...}>  */
5349           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5350           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5351                                            vect2, perm3_mask_high);
5352           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5353           (*result_chain)[j] = data_ref;
5354         }
5355     }
5356   else
5357     {
5358       /* If length is not equal to 3 then only power of 2 is supported.  */
5359       gcc_assert (pow2p_hwi (length));
5360
5361       /* The encoding has 2 interleaved stepped patterns.  */
5362       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5363       vec_perm_builder sel (nelt, 2, 3);
5364       sel.quick_grow (6);
5365       for (i = 0; i < 3; i++)
5366         {
5367           sel[i * 2] = i;
5368           sel[i * 2 + 1] = i + nelt;
5369         }
5370         vec_perm_indices indices (sel, 2, nelt);
5371         perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5372
5373         for (i = 0; i < 6; i++)
5374           sel[i] += exact_div (nelt, 2);
5375         indices.new_vector (sel, 2, nelt);
5376         perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5377
5378         for (i = 0, n = log_length; i < n; i++)
5379           {
5380             for (j = 0; j < length/2; j++)
5381               {
5382                 vect1 = dr_chain[j];
5383                 vect2 = dr_chain[j+length/2];
5384
5385                 /* Create interleaving stmt:
5386                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5387                                                         ...}>  */
5388                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5389                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5390                                                  vect2, perm_mask_high);
5391                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5392                 (*result_chain)[2*j] = high;
5393
5394                 /* Create interleaving stmt:
5395                    low = VEC_PERM_EXPR <vect1, vect2,
5396                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5397                                          ...}>  */
5398                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5399                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5400                                                  vect2, perm_mask_low);
5401                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5402                 (*result_chain)[2*j+1] = low;
5403               }
5404             memcpy (dr_chain.address (), result_chain->address (),
5405                     length * sizeof (tree));
5406           }
5407     }
5408 }
5409
5410 /* Function vect_setup_realignment
5411
5412    This function is called when vectorizing an unaligned load using
5413    the dr_explicit_realign[_optimized] scheme.
5414    This function generates the following code at the loop prolog:
5415
5416       p = initial_addr;
5417    x  msq_init = *(floor(p));   # prolog load
5418       realignment_token = call target_builtin;
5419     loop:
5420    x  msq = phi (msq_init, ---)
5421
5422    The stmts marked with x are generated only for the case of
5423    dr_explicit_realign_optimized.
5424
5425    The code above sets up a new (vector) pointer, pointing to the first
5426    location accessed by STMT, and a "floor-aligned" load using that pointer.
5427    It also generates code to compute the "realignment-token" (if the relevant
5428    target hook was defined), and creates a phi-node at the loop-header bb
5429    whose arguments are the result of the prolog-load (created by this
5430    function) and the result of a load that takes place in the loop (to be
5431    created by the caller to this function).
5432
5433    For the case of dr_explicit_realign_optimized:
5434    The caller to this function uses the phi-result (msq) to create the
5435    realignment code inside the loop, and sets up the missing phi argument,
5436    as follows:
5437     loop:
5438       msq = phi (msq_init, lsq)
5439       lsq = *(floor(p'));        # load in loop
5440       result = realign_load (msq, lsq, realignment_token);
5441
5442    For the case of dr_explicit_realign:
5443     loop:
5444       msq = *(floor(p));        # load in loop
5445       p' = p + (VS-1);
5446       lsq = *(floor(p'));       # load in loop
5447       result = realign_load (msq, lsq, realignment_token);
5448
5449    Input:
5450    STMT - (scalar) load stmt to be vectorized. This load accesses
5451           a memory location that may be unaligned.
5452    BSI - place where new code is to be inserted.
5453    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5454                               is used.
5455
5456    Output:
5457    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5458                        target hook, if defined.
5459    Return value - the result of the loop-header phi node.  */
5460
5461 tree
5462 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5463                         tree *realignment_token,
5464                         enum dr_alignment_support alignment_support_scheme,
5465                         tree init_addr,
5466                         struct loop **at_loop)
5467 {
5468   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5469   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5470   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5471   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5472   struct loop *loop = NULL;
5473   edge pe = NULL;
5474   tree scalar_dest = gimple_assign_lhs (stmt);
5475   tree vec_dest;
5476   gimple *inc;
5477   tree ptr;
5478   tree data_ref;
5479   basic_block new_bb;
5480   tree msq_init = NULL_TREE;
5481   tree new_temp;
5482   gphi *phi_stmt;
5483   tree msq = NULL_TREE;
5484   gimple_seq stmts = NULL;
5485   bool inv_p;
5486   bool compute_in_loop = false;
5487   bool nested_in_vect_loop = false;
5488   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5489   struct loop *loop_for_initial_load = NULL;
5490
5491   if (loop_vinfo)
5492     {
5493       loop = LOOP_VINFO_LOOP (loop_vinfo);
5494       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5495     }
5496
5497   gcc_assert (alignment_support_scheme == dr_explicit_realign
5498               || alignment_support_scheme == dr_explicit_realign_optimized);
5499
5500   /* We need to generate three things:
5501      1. the misalignment computation
5502      2. the extra vector load (for the optimized realignment scheme).
5503      3. the phi node for the two vectors from which the realignment is
5504       done (for the optimized realignment scheme).  */
5505
5506   /* 1. Determine where to generate the misalignment computation.
5507
5508      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5509      calculation will be generated by this function, outside the loop (in the
5510      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5511      caller, inside the loop.
5512
5513      Background: If the misalignment remains fixed throughout the iterations of
5514      the loop, then both realignment schemes are applicable, and also the
5515      misalignment computation can be done outside LOOP.  This is because we are
5516      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5517      are a multiple of VS (the Vector Size), and therefore the misalignment in
5518      different vectorized LOOP iterations is always the same.
5519      The problem arises only if the memory access is in an inner-loop nested
5520      inside LOOP, which is now being vectorized using outer-loop vectorization.
5521      This is the only case when the misalignment of the memory access may not
5522      remain fixed throughout the iterations of the inner-loop (as explained in
5523      detail in vect_supportable_dr_alignment).  In this case, not only is the
5524      optimized realignment scheme not applicable, but also the misalignment
5525      computation (and generation of the realignment token that is passed to
5526      REALIGN_LOAD) have to be done inside the loop.
5527
5528      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5529      or not, which in turn determines if the misalignment is computed inside
5530      the inner-loop, or outside LOOP.  */
5531
5532   if (init_addr != NULL_TREE || !loop_vinfo)
5533     {
5534       compute_in_loop = true;
5535       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5536     }
5537
5538
5539   /* 2. Determine where to generate the extra vector load.
5540
5541      For the optimized realignment scheme, instead of generating two vector
5542      loads in each iteration, we generate a single extra vector load in the
5543      preheader of the loop, and in each iteration reuse the result of the
5544      vector load from the previous iteration.  In case the memory access is in
5545      an inner-loop nested inside LOOP, which is now being vectorized using
5546      outer-loop vectorization, we need to determine whether this initial vector
5547      load should be generated at the preheader of the inner-loop, or can be
5548      generated at the preheader of LOOP.  If the memory access has no evolution
5549      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5550      to be generated inside LOOP (in the preheader of the inner-loop).  */
5551
5552   if (nested_in_vect_loop)
5553     {
5554       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5555       bool invariant_in_outerloop =
5556             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5557       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5558     }
5559   else
5560     loop_for_initial_load = loop;
5561   if (at_loop)
5562     *at_loop = loop_for_initial_load;
5563
5564   if (loop_for_initial_load)
5565     pe = loop_preheader_edge (loop_for_initial_load);
5566
5567   /* 3. For the case of the optimized realignment, create the first vector
5568       load at the loop preheader.  */
5569
5570   if (alignment_support_scheme == dr_explicit_realign_optimized)
5571     {
5572       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5573       gassign *new_stmt;
5574
5575       gcc_assert (!compute_in_loop);
5576       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5577       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5578                                       NULL_TREE, &init_addr, NULL, &inc,
5579                                       true, &inv_p);
5580       if (TREE_CODE (ptr) == SSA_NAME)
5581         new_temp = copy_ssa_name (ptr);
5582       else
5583         new_temp = make_ssa_name (TREE_TYPE (ptr));
5584       unsigned int align = DR_TARGET_ALIGNMENT (dr);
5585       new_stmt = gimple_build_assign
5586                    (new_temp, BIT_AND_EXPR, ptr,
5587                     build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5588       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5589       gcc_assert (!new_bb);
5590       data_ref
5591         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5592                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5593       vect_copy_ref_info (data_ref, DR_REF (dr));
5594       new_stmt = gimple_build_assign (vec_dest, data_ref);
5595       new_temp = make_ssa_name (vec_dest, new_stmt);
5596       gimple_assign_set_lhs (new_stmt, new_temp);
5597       if (pe)
5598         {
5599           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5600           gcc_assert (!new_bb);
5601         }
5602       else
5603          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5604
5605       msq_init = gimple_assign_lhs (new_stmt);
5606     }
5607
5608   /* 4. Create realignment token using a target builtin, if available.
5609       It is done either inside the containing loop, or before LOOP (as
5610       determined above).  */
5611
5612   if (targetm.vectorize.builtin_mask_for_load)
5613     {
5614       gcall *new_stmt;
5615       tree builtin_decl;
5616
5617       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
5618       if (!init_addr)
5619         {
5620           /* Generate the INIT_ADDR computation outside LOOP.  */
5621           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5622                                                             NULL_TREE);
5623           if (loop)
5624             {
5625               pe = loop_preheader_edge (loop);
5626               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5627               gcc_assert (!new_bb);
5628             }
5629           else
5630              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5631         }
5632
5633       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5634       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5635       vec_dest =
5636         vect_create_destination_var (scalar_dest,
5637                                      gimple_call_return_type (new_stmt));
5638       new_temp = make_ssa_name (vec_dest, new_stmt);
5639       gimple_call_set_lhs (new_stmt, new_temp);
5640
5641       if (compute_in_loop)
5642         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5643       else
5644         {
5645           /* Generate the misalignment computation outside LOOP.  */
5646           pe = loop_preheader_edge (loop);
5647           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5648           gcc_assert (!new_bb);
5649         }
5650
5651       *realignment_token = gimple_call_lhs (new_stmt);
5652
5653       /* The result of the CALL_EXPR to this builtin is determined from
5654          the value of the parameter and no global variables are touched
5655          which makes the builtin a "const" function.  Requiring the
5656          builtin to have the "const" attribute makes it unnecessary
5657          to call mark_call_clobbered.  */
5658       gcc_assert (TREE_READONLY (builtin_decl));
5659     }
5660
5661   if (alignment_support_scheme == dr_explicit_realign)
5662     return msq;
5663
5664   gcc_assert (!compute_in_loop);
5665   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5666
5667
5668   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5669
5670   pe = loop_preheader_edge (containing_loop);
5671   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5672   msq = make_ssa_name (vec_dest);
5673   phi_stmt = create_phi_node (msq, containing_loop->header);
5674   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5675
5676   return msq;
5677 }
5678
5679
5680 /* Function vect_grouped_load_supported.
5681
5682    COUNT is the size of the load group (the number of statements plus the
5683    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5684    only one statement, with a gap of COUNT - 1.
5685
5686    Returns true if a suitable permute exists.  */
5687
5688 bool
5689 vect_grouped_load_supported (tree vectype, bool single_element_p,
5690                              unsigned HOST_WIDE_INT count)
5691 {
5692   machine_mode mode = TYPE_MODE (vectype);
5693
5694   /* If this is single-element interleaving with an element distance
5695      that leaves unused vector loads around punt - we at least create
5696      very sub-optimal code in that case (and blow up memory,
5697      see PR65518).  */
5698   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5699     {
5700       if (dump_enabled_p ())
5701         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5702                          "single-element interleaving not supported "
5703                          "for not adjacent vector loads\n");
5704       return false;
5705     }
5706
5707   /* vect_permute_load_chain requires the group size to be equal to 3 or
5708      be a power of two.  */
5709   if (count != 3 && exact_log2 (count) == -1)
5710     {
5711       if (dump_enabled_p ())
5712         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5713                          "the size of the group of accesses"
5714                          " is not a power of 2 or not equal to 3\n");
5715       return false;
5716     }
5717
5718   /* Check that the permutation is supported.  */
5719   if (VECTOR_MODE_P (mode))
5720     {
5721       unsigned int i, j;
5722       if (count == 3)
5723         {
5724           unsigned int nelt;
5725           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5726             {
5727               if (dump_enabled_p ())
5728                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5729                                  "cannot handle groups of 3 loads for"
5730                                  " variable-length vectors\n");
5731               return false;
5732             }
5733
5734           vec_perm_builder sel (nelt, nelt, 1);
5735           sel.quick_grow (nelt);
5736           vec_perm_indices indices;
5737           unsigned int k;
5738           for (k = 0; k < 3; k++)
5739             {
5740               for (i = 0; i < nelt; i++)
5741                 if (3 * i + k < 2 * nelt)
5742                   sel[i] = 3 * i + k;
5743                 else
5744                   sel[i] = 0;
5745               indices.new_vector (sel, 2, nelt);
5746               if (!can_vec_perm_const_p (mode, indices))
5747                 {
5748                   if (dump_enabled_p ())
5749                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5750                                      "shuffle of 3 loads is not supported by"
5751                                      " target\n");
5752                   return false;
5753                 }
5754               for (i = 0, j = 0; i < nelt; i++)
5755                 if (3 * i + k < 2 * nelt)
5756                   sel[i] = i;
5757                 else
5758                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5759               indices.new_vector (sel, 2, nelt);
5760               if (!can_vec_perm_const_p (mode, indices))
5761                 {
5762                   if (dump_enabled_p ())
5763                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5764                                      "shuffle of 3 loads is not supported by"
5765                                      " target\n");
5766                   return false;
5767                 }
5768             }
5769           return true;
5770         }
5771       else
5772         {
5773           /* If length is not equal to 3 then only power of 2 is supported.  */
5774           gcc_assert (pow2p_hwi (count));
5775           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5776
5777           /* The encoding has a single stepped pattern.  */
5778           vec_perm_builder sel (nelt, 1, 3);
5779           sel.quick_grow (3);
5780           for (i = 0; i < 3; i++)
5781             sel[i] = i * 2;
5782           vec_perm_indices indices (sel, 2, nelt);
5783           if (can_vec_perm_const_p (mode, indices))
5784             {
5785               for (i = 0; i < 3; i++)
5786                 sel[i] = i * 2 + 1;
5787               indices.new_vector (sel, 2, nelt);
5788               if (can_vec_perm_const_p (mode, indices))
5789                 return true;
5790             }
5791         }
5792     }
5793
5794   if (dump_enabled_p ())
5795     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5796                      "extract even/odd not supported by target\n");
5797   return false;
5798 }
5799
5800 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5801    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5802
5803 bool
5804 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5805                            bool masked_p)
5806 {
5807   if (masked_p)
5808     return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5809                                          vec_mask_load_lanes_optab,
5810                                          vectype, count);
5811   else
5812     return vect_lanes_optab_supported_p ("vec_load_lanes",
5813                                          vec_load_lanes_optab,
5814                                          vectype, count);
5815 }
5816
5817 /* Function vect_permute_load_chain.
5818
5819    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5820    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5821    the input data correctly.  Return the final references for loads in
5822    RESULT_CHAIN.
5823
5824    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5825    The input is 4 vectors each containing 8 elements. We assign a number to each
5826    element, the input sequence is:
5827
5828    1st vec:   0  1  2  3  4  5  6  7
5829    2nd vec:   8  9 10 11 12 13 14 15
5830    3rd vec:  16 17 18 19 20 21 22 23
5831    4th vec:  24 25 26 27 28 29 30 31
5832
5833    The output sequence should be:
5834
5835    1st vec:  0 4  8 12 16 20 24 28
5836    2nd vec:  1 5  9 13 17 21 25 29
5837    3rd vec:  2 6 10 14 18 22 26 30
5838    4th vec:  3 7 11 15 19 23 27 31
5839
5840    i.e., the first output vector should contain the first elements of each
5841    interleaving group, etc.
5842
5843    We use extract_even/odd instructions to create such output.  The input of
5844    each extract_even/odd operation is two vectors
5845    1st vec    2nd vec
5846    0 1 2 3    4 5 6 7
5847
5848    and the output is the vector of extracted even/odd elements.  The output of
5849    extract_even will be:   0 2 4 6
5850    and of extract_odd:     1 3 5 7
5851
5852
5853    The permutation is done in log LENGTH stages.  In each stage extract_even
5854    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5855    their order.  In our example,
5856
5857    E1: extract_even (1st vec, 2nd vec)
5858    E2: extract_odd (1st vec, 2nd vec)
5859    E3: extract_even (3rd vec, 4th vec)
5860    E4: extract_odd (3rd vec, 4th vec)
5861
5862    The output for the first stage will be:
5863
5864    E1:  0  2  4  6  8 10 12 14
5865    E2:  1  3  5  7  9 11 13 15
5866    E3: 16 18 20 22 24 26 28 30
5867    E4: 17 19 21 23 25 27 29 31
5868
5869    In order to proceed and create the correct sequence for the next stage (or
5870    for the correct output, if the second stage is the last one, as in our
5871    example), we first put the output of extract_even operation and then the
5872    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5873    The input for the second stage is:
5874
5875    1st vec (E1):  0  2  4  6  8 10 12 14
5876    2nd vec (E3): 16 18 20 22 24 26 28 30
5877    3rd vec (E2):  1  3  5  7  9 11 13 15
5878    4th vec (E4): 17 19 21 23 25 27 29 31
5879
5880    The output of the second stage:
5881
5882    E1: 0 4  8 12 16 20 24 28
5883    E2: 2 6 10 14 18 22 26 30
5884    E3: 1 5  9 13 17 21 25 29
5885    E4: 3 7 11 15 19 23 27 31
5886
5887    And RESULT_CHAIN after reordering:
5888
5889    1st vec (E1):  0 4  8 12 16 20 24 28
5890    2nd vec (E3):  1 5  9 13 17 21 25 29
5891    3rd vec (E2):  2 6 10 14 18 22 26 30
5892    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5893
5894 static void
5895 vect_permute_load_chain (vec<tree> dr_chain,
5896                          unsigned int length,
5897                          gimple *stmt,
5898                          gimple_stmt_iterator *gsi,
5899                          vec<tree> *result_chain)
5900 {
5901   tree data_ref, first_vect, second_vect;
5902   tree perm_mask_even, perm_mask_odd;
5903   tree perm3_mask_low, perm3_mask_high;
5904   gimple *perm_stmt;
5905   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5906   unsigned int i, j, log_length = exact_log2 (length);
5907
5908   result_chain->quick_grow (length);
5909   memcpy (result_chain->address (), dr_chain.address (),
5910           length * sizeof (tree));
5911
5912   if (length == 3)
5913     {
5914       /* vect_grouped_load_supported ensures that this is constant.  */
5915       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5916       unsigned int k;
5917
5918       vec_perm_builder sel (nelt, nelt, 1);
5919       sel.quick_grow (nelt);
5920       vec_perm_indices indices;
5921       for (k = 0; k < 3; k++)
5922         {
5923           for (i = 0; i < nelt; i++)
5924             if (3 * i + k < 2 * nelt)
5925               sel[i] = 3 * i + k;
5926             else
5927               sel[i] = 0;
5928           indices.new_vector (sel, 2, nelt);
5929           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5930
5931           for (i = 0, j = 0; i < nelt; i++)
5932             if (3 * i + k < 2 * nelt)
5933               sel[i] = i;
5934             else
5935               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5936           indices.new_vector (sel, 2, nelt);
5937           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5938
5939           first_vect = dr_chain[0];
5940           second_vect = dr_chain[1];
5941
5942           /* Create interleaving stmt (low part of):
5943              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5944                                                              ...}>  */
5945           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5946           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5947                                            second_vect, perm3_mask_low);
5948           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5949
5950           /* Create interleaving stmt (high part of):
5951              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5952                                                               ...}>  */
5953           first_vect = data_ref;
5954           second_vect = dr_chain[2];
5955           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5956           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5957                                            second_vect, perm3_mask_high);
5958           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5959           (*result_chain)[k] = data_ref;
5960         }
5961     }
5962   else
5963     {
5964       /* If length is not equal to 3 then only power of 2 is supported.  */
5965       gcc_assert (pow2p_hwi (length));
5966
5967       /* The encoding has a single stepped pattern.  */
5968       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5969       vec_perm_builder sel (nelt, 1, 3);
5970       sel.quick_grow (3);
5971       for (i = 0; i < 3; ++i)
5972         sel[i] = i * 2;
5973       vec_perm_indices indices (sel, 2, nelt);
5974       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5975
5976       for (i = 0; i < 3; ++i)
5977         sel[i] = i * 2 + 1;
5978       indices.new_vector (sel, 2, nelt);
5979       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5980
5981       for (i = 0; i < log_length; i++)
5982         {
5983           for (j = 0; j < length; j += 2)
5984             {
5985               first_vect = dr_chain[j];
5986               second_vect = dr_chain[j+1];
5987
5988               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5989               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5990               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5991                                                first_vect, second_vect,
5992                                                perm_mask_even);
5993               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5994               (*result_chain)[j/2] = data_ref;
5995
5996               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5997               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5998               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5999                                                first_vect, second_vect,
6000                                                perm_mask_odd);
6001               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6002               (*result_chain)[j/2+length/2] = data_ref;
6003             }
6004           memcpy (dr_chain.address (), result_chain->address (),
6005                   length * sizeof (tree));
6006         }
6007     }
6008 }
6009
6010 /* Function vect_shift_permute_load_chain.
6011
6012    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6013    sequence of stmts to reorder the input data accordingly.
6014    Return the final references for loads in RESULT_CHAIN.
6015    Return true if successed, false otherwise.
6016
6017    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6018    The input is 3 vectors each containing 8 elements.  We assign a
6019    number to each element, the input sequence is:
6020
6021    1st vec:   0  1  2  3  4  5  6  7
6022    2nd vec:   8  9 10 11 12 13 14 15
6023    3rd vec:  16 17 18 19 20 21 22 23
6024
6025    The output sequence should be:
6026
6027    1st vec:  0 3 6  9 12 15 18 21
6028    2nd vec:  1 4 7 10 13 16 19 22
6029    3rd vec:  2 5 8 11 14 17 20 23
6030
6031    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6032
6033    First we shuffle all 3 vectors to get correct elements order:
6034
6035    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
6036    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
6037    3rd vec:  (16 19 22) (17 20 23) (18 21)
6038
6039    Next we unite and shift vector 3 times:
6040
6041    1st step:
6042      shift right by 6 the concatenation of:
6043      "1st vec" and  "2nd vec"
6044        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6045      "2nd vec" and  "3rd vec"
6046        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6047      "3rd vec" and  "1st vec"
6048        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6049                              | New vectors                   |
6050
6051      So that now new vectors are:
6052
6053      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6054      2nd vec:  (10 13) (16 19 22) (17 20 23)
6055      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6056
6057    2nd step:
6058      shift right by 5 the concatenation of:
6059      "1st vec" and  "3rd vec"
6060        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6061      "2nd vec" and  "1st vec"
6062        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6063      "3rd vec" and  "2nd vec"
6064        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6065                           | New vectors                   |
6066
6067      So that now new vectors are:
6068
6069      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6070      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6071      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6072
6073    3rd step:
6074      shift right by 5 the concatenation of:
6075      "1st vec" and  "1st vec"
6076        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6077      shift right by 3 the concatenation of:
6078      "2nd vec" and  "2nd vec"
6079                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6080                           | New vectors                   |
6081
6082      So that now all vectors are READY:
6083      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6084      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6085      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6086
6087    This algorithm is faster than one in vect_permute_load_chain if:
6088      1.  "shift of a concatination" is faster than general permutation.
6089          This is usually so.
6090      2.  The TARGET machine can't execute vector instructions in parallel.
6091          This is because each step of the algorithm depends on previous.
6092          The algorithm in vect_permute_load_chain is much more parallel.
6093
6094    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6095 */
6096
6097 static bool
6098 vect_shift_permute_load_chain (vec<tree> dr_chain,
6099                                unsigned int length,
6100                                gimple *stmt,
6101                                gimple_stmt_iterator *gsi,
6102                                vec<tree> *result_chain)
6103 {
6104   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6105   tree perm2_mask1, perm2_mask2, perm3_mask;
6106   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6107   gimple *perm_stmt;
6108
6109   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6110   unsigned int i;
6111   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6112   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6113
6114   unsigned HOST_WIDE_INT nelt, vf;
6115   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6116       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6117     /* Not supported for variable-length vectors.  */
6118     return false;
6119
6120   vec_perm_builder sel (nelt, nelt, 1);
6121   sel.quick_grow (nelt);
6122
6123   result_chain->quick_grow (length);
6124   memcpy (result_chain->address (), dr_chain.address (),
6125           length * sizeof (tree));
6126
6127   if (pow2p_hwi (length) && vf > 4)
6128     {
6129       unsigned int j, log_length = exact_log2 (length);
6130       for (i = 0; i < nelt / 2; ++i)
6131         sel[i] = i * 2;
6132       for (i = 0; i < nelt / 2; ++i)
6133         sel[nelt / 2 + i] = i * 2 + 1;
6134       vec_perm_indices indices (sel, 2, nelt);
6135       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6136         {
6137           if (dump_enabled_p ())
6138             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6139                              "shuffle of 2 fields structure is not \
6140                               supported by target\n");
6141           return false;
6142         }
6143       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6144
6145       for (i = 0; i < nelt / 2; ++i)
6146         sel[i] = i * 2 + 1;
6147       for (i = 0; i < nelt / 2; ++i)
6148         sel[nelt / 2 + i] = i * 2;
6149       indices.new_vector (sel, 2, nelt);
6150       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6151         {
6152           if (dump_enabled_p ())
6153             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6154                              "shuffle of 2 fields structure is not \
6155                               supported by target\n");
6156           return false;
6157         }
6158       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6159
6160       /* Generating permutation constant to shift all elements.
6161          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6162       for (i = 0; i < nelt; i++)
6163         sel[i] = nelt / 2 + i;
6164       indices.new_vector (sel, 2, nelt);
6165       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6166         {
6167           if (dump_enabled_p ())
6168             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6169                              "shift permutation is not supported by target\n");
6170           return false;
6171         }
6172       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6173
6174       /* Generating permutation constant to select vector from 2.
6175          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6176       for (i = 0; i < nelt / 2; i++)
6177         sel[i] = i;
6178       for (i = nelt / 2; i < nelt; i++)
6179         sel[i] = nelt + i;
6180       indices.new_vector (sel, 2, nelt);
6181       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6182         {
6183           if (dump_enabled_p ())
6184             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6185                              "select is not supported by target\n");
6186           return false;
6187         }
6188       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6189
6190       for (i = 0; i < log_length; i++)
6191         {
6192           for (j = 0; j < length; j += 2)
6193             {
6194               first_vect = dr_chain[j];
6195               second_vect = dr_chain[j + 1];
6196
6197               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6198               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6199                                                first_vect, first_vect,
6200                                                perm2_mask1);
6201               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6202               vect[0] = data_ref;
6203
6204               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6205               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6206                                                second_vect, second_vect,
6207                                                perm2_mask2);
6208               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6209               vect[1] = data_ref;
6210
6211               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6212               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6213                                                vect[0], vect[1], shift1_mask);
6214               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6215               (*result_chain)[j/2 + length/2] = data_ref;
6216
6217               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6218               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6219                                                vect[0], vect[1], select_mask);
6220               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6221               (*result_chain)[j/2] = data_ref;
6222             }
6223           memcpy (dr_chain.address (), result_chain->address (),
6224                   length * sizeof (tree));
6225         }
6226       return true;
6227     }
6228   if (length == 3 && vf > 2)
6229     {
6230       unsigned int k = 0, l = 0;
6231
6232       /* Generating permutation constant to get all elements in rigth order.
6233          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6234       for (i = 0; i < nelt; i++)
6235         {
6236           if (3 * k + (l % 3) >= nelt)
6237             {
6238               k = 0;
6239               l += (3 - (nelt % 3));
6240             }
6241           sel[i] = 3 * k + (l % 3);
6242           k++;
6243         }
6244       vec_perm_indices indices (sel, 2, nelt);
6245       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6246         {
6247           if (dump_enabled_p ())
6248             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6249                              "shuffle of 3 fields structure is not \
6250                               supported by target\n");
6251           return false;
6252         }
6253       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6254
6255       /* Generating permutation constant to shift all elements.
6256          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6257       for (i = 0; i < nelt; i++)
6258         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6259       indices.new_vector (sel, 2, nelt);
6260       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6261         {
6262           if (dump_enabled_p ())
6263             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6264                              "shift permutation is not supported by target\n");
6265           return false;
6266         }
6267       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6268
6269       /* Generating permutation constant to shift all elements.
6270          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6271       for (i = 0; i < nelt; i++)
6272         sel[i] = 2 * (nelt / 3) + 1 + i;
6273       indices.new_vector (sel, 2, nelt);
6274       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6275         {
6276           if (dump_enabled_p ())
6277             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6278                              "shift permutation is not supported by target\n");
6279           return false;
6280         }
6281       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6282
6283       /* Generating permutation constant to shift all elements.
6284          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6285       for (i = 0; i < nelt; i++)
6286         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6287       indices.new_vector (sel, 2, nelt);
6288       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6289         {
6290           if (dump_enabled_p ())
6291             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6292                              "shift permutation is not supported by target\n");
6293           return false;
6294         }
6295       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6296
6297       /* Generating permutation constant to shift all elements.
6298          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6299       for (i = 0; i < nelt; i++)
6300         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6301       indices.new_vector (sel, 2, nelt);
6302       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6303         {
6304           if (dump_enabled_p ())
6305             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6306                              "shift permutation is not supported by target\n");
6307           return false;
6308         }
6309       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6310
6311       for (k = 0; k < 3; k++)
6312         {
6313           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6314           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6315                                            dr_chain[k], dr_chain[k],
6316                                            perm3_mask);
6317           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6318           vect[k] = data_ref;
6319         }
6320
6321       for (k = 0; k < 3; k++)
6322         {
6323           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6324           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6325                                            vect[k % 3], vect[(k + 1) % 3],
6326                                            shift1_mask);
6327           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6328           vect_shift[k] = data_ref;
6329         }
6330
6331       for (k = 0; k < 3; k++)
6332         {
6333           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6334           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6335                                            vect_shift[(4 - k) % 3],
6336                                            vect_shift[(3 - k) % 3],
6337                                            shift2_mask);
6338           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6339           vect[k] = data_ref;
6340         }
6341
6342       (*result_chain)[3 - (nelt % 3)] = vect[2];
6343
6344       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6345       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6346                                        vect[0], shift3_mask);
6347       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6348       (*result_chain)[nelt % 3] = data_ref;
6349
6350       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6351       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6352                                        vect[1], shift4_mask);
6353       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6354       (*result_chain)[0] = data_ref;
6355       return true;
6356     }
6357   return false;
6358 }
6359
6360 /* Function vect_transform_grouped_load.
6361
6362    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6363    to perform their permutation and ascribe the result vectorized statements to
6364    the scalar statements.
6365 */
6366
6367 void
6368 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6369                              gimple_stmt_iterator *gsi)
6370 {
6371   machine_mode mode;
6372   vec<tree> result_chain = vNULL;
6373
6374   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6375      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6376      vectors, that are ready for vector computation.  */
6377   result_chain.create (size);
6378
6379   /* If reassociation width for vector type is 2 or greater target machine can
6380      execute 2 or more vector instructions in parallel.  Otherwise try to
6381      get chain for loads group using vect_shift_permute_load_chain.  */
6382   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6383   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6384       || pow2p_hwi (size)
6385       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6386                                          gsi, &result_chain))
6387     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6388   vect_record_grouped_load_vectors (stmt, result_chain);
6389   result_chain.release ();
6390 }
6391
6392 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6393    generated as part of the vectorization of STMT.  Assign the statement
6394    for each vector to the associated scalar statement.  */
6395
6396 void
6397 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6398 {
6399   gimple *first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6400   gimple *next_stmt, *new_stmt;
6401   unsigned int i, gap_count;
6402   tree tmp_data_ref;
6403
6404   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6405      Since we scan the chain starting from it's first node, their order
6406      corresponds the order of data-refs in RESULT_CHAIN.  */
6407   next_stmt = first_stmt;
6408   gap_count = 1;
6409   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6410     {
6411       if (!next_stmt)
6412         break;
6413
6414       /* Skip the gaps.  Loads created for the gaps will be removed by dead
6415        code elimination pass later.  No need to check for the first stmt in
6416        the group, since it always exists.
6417        DR_GROUP_GAP is the number of steps in elements from the previous
6418        access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
6419        correspond to the gaps.  */
6420       if (next_stmt != first_stmt
6421           && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
6422       {
6423         gap_count++;
6424         continue;
6425       }
6426
6427       while (next_stmt)
6428         {
6429           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6430           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6431              copies, and we put the new vector statement in the first available
6432              RELATED_STMT.  */
6433           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6434             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6435           else
6436             {
6437               if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6438                 {
6439                   gimple *prev_stmt =
6440                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6441                   gimple *rel_stmt =
6442                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6443                   while (rel_stmt)
6444                     {
6445                       prev_stmt = rel_stmt;
6446                       rel_stmt =
6447                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6448                     }
6449
6450                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6451                     new_stmt;
6452                 }
6453             }
6454
6455           next_stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6456           gap_count = 1;
6457           /* If NEXT_STMT accesses the same DR as the previous statement,
6458              put the same TMP_DATA_REF as its vectorized statement; otherwise
6459              get the next data-ref from RESULT_CHAIN.  */
6460           if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6461             break;
6462         }
6463     }
6464 }
6465
6466 /* Function vect_force_dr_alignment_p.
6467
6468    Returns whether the alignment of a DECL can be forced to be aligned
6469    on ALIGNMENT bit boundary.  */
6470
6471 bool
6472 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6473 {
6474   if (!VAR_P (decl))
6475     return false;
6476
6477   if (decl_in_symtab_p (decl)
6478       && !symtab_node::get (decl)->can_increase_alignment_p ())
6479     return false;
6480
6481   if (TREE_STATIC (decl))
6482     return (alignment <= MAX_OFILE_ALIGNMENT);
6483   else
6484     return (alignment <= MAX_STACK_ALIGNMENT);
6485 }
6486
6487
6488 /* Return whether the data reference DR is supported with respect to its
6489    alignment.
6490    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6491    it is aligned, i.e., check if it is possible to vectorize it with different
6492    alignment.  */
6493
6494 enum dr_alignment_support
6495 vect_supportable_dr_alignment (struct data_reference *dr,
6496                                bool check_aligned_accesses)
6497 {
6498   gimple *stmt = DR_STMT (dr);
6499   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6500   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6501   machine_mode mode = TYPE_MODE (vectype);
6502   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6503   struct loop *vect_loop = NULL;
6504   bool nested_in_vect_loop = false;
6505
6506   if (aligned_access_p (dr) && !check_aligned_accesses)
6507     return dr_aligned;
6508
6509   /* For now assume all conditional loads/stores support unaligned
6510      access without any special code.  */
6511   if (is_gimple_call (stmt)
6512       && gimple_call_internal_p (stmt)
6513       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6514           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6515     return dr_unaligned_supported;
6516
6517   if (loop_vinfo)
6518     {
6519       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6520       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6521     }
6522
6523   /* Possibly unaligned access.  */
6524
6525   /* We can choose between using the implicit realignment scheme (generating
6526      a misaligned_move stmt) and the explicit realignment scheme (generating
6527      aligned loads with a REALIGN_LOAD).  There are two variants to the
6528      explicit realignment scheme: optimized, and unoptimized.
6529      We can optimize the realignment only if the step between consecutive
6530      vector loads is equal to the vector size.  Since the vector memory
6531      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6532      is guaranteed that the misalignment amount remains the same throughout the
6533      execution of the vectorized loop.  Therefore, we can create the
6534      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6535      at the loop preheader.
6536
6537      However, in the case of outer-loop vectorization, when vectorizing a
6538      memory access in the inner-loop nested within the LOOP that is now being
6539      vectorized, while it is guaranteed that the misalignment of the
6540      vectorized memory access will remain the same in different outer-loop
6541      iterations, it is *not* guaranteed that is will remain the same throughout
6542      the execution of the inner-loop.  This is because the inner-loop advances
6543      with the original scalar step (and not in steps of VS).  If the inner-loop
6544      step happens to be a multiple of VS, then the misalignment remains fixed
6545      and we can use the optimized realignment scheme.  For example:
6546
6547       for (i=0; i<N; i++)
6548         for (j=0; j<M; j++)
6549           s += a[i+j];
6550
6551      When vectorizing the i-loop in the above example, the step between
6552      consecutive vector loads is 1, and so the misalignment does not remain
6553      fixed across the execution of the inner-loop, and the realignment cannot
6554      be optimized (as illustrated in the following pseudo vectorized loop):
6555
6556       for (i=0; i<N; i+=4)
6557         for (j=0; j<M; j++){
6558           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6559                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6560                          // (assuming that we start from an aligned address).
6561           }
6562
6563      We therefore have to use the unoptimized realignment scheme:
6564
6565       for (i=0; i<N; i+=4)
6566           for (j=k; j<M; j+=4)
6567           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6568                            // that the misalignment of the initial address is
6569                            // 0).
6570
6571      The loop can then be vectorized as follows:
6572
6573       for (k=0; k<4; k++){
6574         rt = get_realignment_token (&vp[k]);
6575         for (i=0; i<N; i+=4){
6576           v1 = vp[i+k];
6577           for (j=k; j<M; j+=4){
6578             v2 = vp[i+j+VS-1];
6579             va = REALIGN_LOAD <v1,v2,rt>;
6580             vs += va;
6581             v1 = v2;
6582           }
6583         }
6584     } */
6585
6586   if (DR_IS_READ (dr))
6587     {
6588       bool is_packed = false;
6589       tree type = (TREE_TYPE (DR_REF (dr)));
6590
6591       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6592           && (!targetm.vectorize.builtin_mask_for_load
6593               || targetm.vectorize.builtin_mask_for_load ()))
6594         {
6595           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6596
6597           /* If we are doing SLP then the accesses need not have the
6598              same alignment, instead it depends on the SLP group size.  */
6599           if (loop_vinfo
6600               && STMT_SLP_TYPE (stmt_info)
6601               && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6602                               * DR_GROUP_SIZE (vinfo_for_stmt
6603                                             (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6604                               TYPE_VECTOR_SUBPARTS (vectype)))
6605             ;
6606           else if (!loop_vinfo
6607                    || (nested_in_vect_loop
6608                        && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6609                                     GET_MODE_SIZE (TYPE_MODE (vectype)))))
6610             return dr_explicit_realign;
6611           else
6612             return dr_explicit_realign_optimized;
6613         }
6614       if (!known_alignment_for_access_p (dr))
6615         is_packed = not_size_aligned (DR_REF (dr));
6616
6617       if (targetm.vectorize.support_vector_misalignment
6618             (mode, type, DR_MISALIGNMENT (dr), is_packed))
6619         /* Can't software pipeline the loads, but can at least do them.  */
6620         return dr_unaligned_supported;
6621     }
6622   else
6623     {
6624       bool is_packed = false;
6625       tree type = (TREE_TYPE (DR_REF (dr)));
6626
6627       if (!known_alignment_for_access_p (dr))
6628         is_packed = not_size_aligned (DR_REF (dr));
6629
6630      if (targetm.vectorize.support_vector_misalignment
6631            (mode, type, DR_MISALIGNMENT (dr), is_packed))
6632        return dr_unaligned_supported;
6633     }
6634
6635   /* Unsupported.  */
6636   return dr_unaligned_unsupported;
6637 }