gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "params.h"
  53 #include "tree-cfg.h"
  54 #include "tree-hash-traits.h"
  55 #include "vec-perm-indices.h"
  56 #include "internal-fn.h"
  57
  58 /* Return true if load- or store-lanes optab OPTAB is implemented for
  59    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  60
  61 static bool
  62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  63                               tree vectype, unsigned HOST_WIDE_INT count)
  64 {
  65   machine_mode mode, array_mode;
  66   bool limit_p;
  67
  68   mode = TYPE_MODE (vectype);
  69   if (!targetm.array_mode (mode, count).exists (&array_mode))
  70     {
  71       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
  72       limit_p = !targetm.array_mode_supported_p (mode, count);
  73       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
  74         {
  75           if (dump_enabled_p ())
  76             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  77                              "no array mode for %s["
  78                              HOST_WIDE_INT_PRINT_DEC "]\n",
  79                              GET_MODE_NAME (mode), count);
  80           return false;
  81         }
  82     }
  83
  84   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  85     {
  86       if (dump_enabled_p ())
  87         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  88                          "cannot use %s<%s><%s>\n", name,
  89                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  90       return false;
  91     }
  92
  93   if (dump_enabled_p ())
  94     dump_printf_loc (MSG_NOTE, vect_location,
  95                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  96                      GET_MODE_NAME (mode));
  97
  98   return true;
  99 }
 100
 101
 102 /* Return the smallest scalar part of STMT.
 103    This is used to determine the vectype of the stmt.  We generally set the
 104    vectype according to the type of the result (lhs).  For stmts whose
 105    result-type is different than the type of the arguments (e.g., demotion,
 106    promotion), vectype will be reset appropriately (later).  Note that we have
 107    to visit the smallest datatype in this function, because that determines the
 108    VF.  If the smallest datatype in the loop is present only as the rhs of a
 109    promotion operation - we'd miss it.
 110    Such a case, where a variable of this datatype does not appear in the lhs
 111    anywhere in the loop, can only occur if it's an invariant: e.g.:
 112    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 113    invariant motion.  However, we cannot rely on invariant motion to always
 114    take invariants out of the loop, and so in the case of promotion we also
 115    have to check the rhs.
 116    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 117    types.  */
 118
 119 tree
 120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
 121                                HOST_WIDE_INT *rhs_size_unit)
 122 {
 123   tree scalar_type = gimple_expr_type (stmt);
 124   HOST_WIDE_INT lhs, rhs;
 125
 126   /* During the analysis phase, this function is called on arbitrary
 127      statements that might not have scalar results.  */
 128   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
 129     return scalar_type;
 130
 131   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 132
 133   if (is_gimple_assign (stmt)
 134       && (gimple_assign_cast_p (stmt)
 135           || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
 136           || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
 137           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
 138           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
 139           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
 140     {
 141       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 142
 143       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 144       if (rhs < lhs)
 145         scalar_type = rhs_type;
 146     }
 147
 148   *lhs_size_unit = lhs;
 149   *rhs_size_unit = rhs;
 150   return scalar_type;
 151 }
 152
 153
 154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 155    tested at run-time.  Return TRUE if DDR was successfully inserted.
 156    Return false if versioning is not supported.  */
 157
 158 static bool
 159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 160 {
 161   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 162
 163   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 164     return false;
 165
 166   if (!runtime_alias_check_p (ddr, loop,
 167                               optimize_loop_nest_for_speed_p (loop)))
 168     return false;
 169
 170   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 171   return true;
 172 }
 173
 174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
 175
 176 static void
 177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
 178 {
 179   vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
 180   for (unsigned int i = 0; i < checks.length(); ++i)
 181     if (checks[i] == value)
 182       return;
 183
 184   if (dump_enabled_p ())
 185     {
 186       dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
 187       dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
 188       dump_printf (MSG_NOTE, " is nonzero\n");
 189     }
 190   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
 191 }
 192
 193 /* Return true if we know that the order of vectorized STMT_A and
 194    vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
 195    At least one of the statements is a write.  */
 196
 197 static bool
 198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
 199 {
 200   stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
 201   stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
 202
 203   /* Single statements are always kept in their original order.  */
 204   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 205       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 206     return true;
 207
 208   /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
 209      group are emitted at the position of the first scalar load and all
 210      stores in a group are emitted at the position of the last scalar store.
 211      Thus writes will happen no earlier than their current position
 212      (but could happen later) while reads will happen no later than their
 213      current position (but could happen earlier).  Reordering is therefore
 214      only possible if the first access is a write.  */
 215   gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
 216   return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
 217 }
 218
 219 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 220    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 221    distances.  These distances are conservatively correct but they don't
 222    reflect a guaranteed dependence.
 223
 224    Return true if this function does all the work necessary to avoid
 225    an alias or false if the caller should use the dependence distances
 226    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 227    the depth of the loop described by LOOP_VINFO and the other arguments
 228    are as for vect_analyze_data_ref_dependence.  */
 229
 230 static bool
 231 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 232                                        loop_vec_info loop_vinfo,
 233                                        int loop_depth, unsigned int *max_vf)
 234 {
 235   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 236   lambda_vector dist_v;
 237   unsigned int i;
 238   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 239     {
 240       int dist = dist_v[loop_depth];
 241       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 242         {
 243           /* If the user asserted safelen >= DIST consecutive iterations
 244              can be executed concurrently, assume independence.
 245
 246              ??? An alternative would be to add the alias check even
 247              in this case, and vectorize the fallback loop with the
 248              maximum VF set to safelen.  However, if the user has
 249              explicitly given a length, it's less likely that that
 250              would be a win.  */
 251           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 252             {
 253               if ((unsigned int) loop->safelen < *max_vf)
 254                 *max_vf = loop->safelen;
 255               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 256               continue;
 257             }
 258
 259           /* For dependence distances of 2 or more, we have the option
 260              of limiting VF or checking for an alias at runtime.
 261              Prefer to check at runtime if we can, to avoid limiting
 262              the VF unnecessarily when the bases are in fact independent.
 263
 264              Note that the alias checks will be removed if the VF ends up
 265              being small enough.  */
 266           return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 267         }
 268     }
 269   return true;
 270 }
 271
 272
 273 /* Function vect_analyze_data_ref_dependence.
 274
 275    Return TRUE if there (might) exist a dependence between a memory-reference
 276    DRA and a memory-reference DRB.  When versioning for alias may check a
 277    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 278    the data dependence.  */
 279
 280 static bool
 281 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 282                                   loop_vec_info loop_vinfo,
 283                                   unsigned int *max_vf)
 284 {
 285   unsigned int i;
 286   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 287   struct data_reference *dra = DDR_A (ddr);
 288   struct data_reference *drb = DDR_B (ddr);
 289   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
 290   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
 291   lambda_vector dist_v;
 292   unsigned int loop_depth;
 293
 294   /* In loop analysis all data references should be vectorizable.  */
 295   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 296       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 297     gcc_unreachable ();
 298
 299   /* Independent data accesses.  */
 300   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 301     return false;
 302
 303   if (dra == drb
 304       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 305     return false;
 306
 307   /* We do not have to consider dependences between accesses that belong
 308      to the same group, unless the stride could be smaller than the
 309      group size.  */
 310   if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 311       && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 312           == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
 313       && !STMT_VINFO_STRIDED_P (stmtinfo_a))
 314     return false;
 315
 316   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 317      least two scalar iterations, there is always also a true dependence.
 318      As the vectorizer does not re-order loads and stores we can ignore
 319      the anti-dependence if TBAA can disambiguate both DRs similar to the
 320      case with known negative distance anti-dependences (positive
 321      distance anti-dependences would violate TBAA constraints).  */
 322   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 323        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 324       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 325                                  get_alias_set (DR_REF (drb))))
 326     return false;
 327
 328   /* Unknown data dependence.  */
 329   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 330     {
 331       /* If user asserted safelen consecutive iterations can be
 332          executed concurrently, assume independence.  */
 333       if (loop->safelen >= 2)
 334         {
 335           if ((unsigned int) loop->safelen < *max_vf)
 336             *max_vf = loop->safelen;
 337           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 338           return false;
 339         }
 340
 341       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 342           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 343         {
 344           if (dump_enabled_p ())
 345             {
 346               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 347                                "versioning for alias not supported for: "
 348                                "can't determine dependence between ");
 349               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 350                                  DR_REF (dra));
 351               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 352               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 353                                  DR_REF (drb));
 354               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 355             }
 356           return true;
 357         }
 358
 359       if (dump_enabled_p ())
 360         {
 361           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 362                            "versioning for alias required: "
 363                            "can't determine dependence between ");
 364           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 365                              DR_REF (dra));
 366           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 367           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 368                              DR_REF (drb));
 369           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 370         }
 371
 372       /* Add to list of ddrs that need to be tested at run-time.  */
 373       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 374     }
 375
 376   /* Known data dependence.  */
 377   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 378     {
 379       /* If user asserted safelen consecutive iterations can be
 380          executed concurrently, assume independence.  */
 381       if (loop->safelen >= 2)
 382         {
 383           if ((unsigned int) loop->safelen < *max_vf)
 384             *max_vf = loop->safelen;
 385           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 386           return false;
 387         }
 388
 389       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 390           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 391         {
 392           if (dump_enabled_p ())
 393             {
 394               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 395                                "versioning for alias not supported for: "
 396                                "bad dist vector for ");
 397               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 398                                  DR_REF (dra));
 399               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 400               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 401                                  DR_REF (drb));
 402               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 403             }
 404           return true;
 405         }
 406
 407       if (dump_enabled_p ())
 408         {
 409           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 410                            "versioning for alias required: "
 411                            "bad dist vector for ");
 412           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 413           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 414           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 415           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 416         }
 417       /* Add to list of ddrs that need to be tested at run-time.  */
 418       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 419     }
 420
 421   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 422
 423   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
 424       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
 425                                                 loop_depth, max_vf))
 426     return false;
 427
 428   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 429     {
 430       int dist = dist_v[loop_depth];
 431
 432       if (dump_enabled_p ())
 433         dump_printf_loc (MSG_NOTE, vect_location,
 434                          "dependence distance  = %d.\n", dist);
 435
 436       if (dist == 0)
 437         {
 438           if (dump_enabled_p ())
 439             {
 440               dump_printf_loc (MSG_NOTE, vect_location,
 441                                "dependence distance == 0 between ");
 442               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 443               dump_printf (MSG_NOTE, " and ");
 444               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 445               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 446             }
 447
 448           /* When we perform grouped accesses and perform implicit CSE
 449              by detecting equal accesses and doing disambiguation with
 450              runtime alias tests like for
 451                 .. = a[i];
 452                 .. = a[i+1];
 453                 a[i] = ..;
 454                 a[i+1] = ..;
 455                 *p = ..;
 456                 .. = a[i];
 457                 .. = a[i+1];
 458              where we will end up loading { a[i], a[i+1] } once, make
 459              sure that inserting group loads before the first load and
 460              stores after the last store will do the right thing.
 461              Similar for groups like
 462                 a[i] = ...;
 463                 ... = a[i];
 464                 a[i+1] = ...;
 465              where loads from the group interleave with the store.  */
 466           if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
 467             {
 468               if (dump_enabled_p ())
 469                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 470                                  "READ_WRITE dependence in interleaving.\n");
 471               return true;
 472             }
 473
 474           if (loop->safelen < 2)
 475             {
 476               tree indicator = dr_zero_step_indicator (dra);
 477               if (TREE_CODE (indicator) != INTEGER_CST)
 478                 vect_check_nonzero_value (loop_vinfo, indicator);
 479               else if (integer_zerop (indicator))
 480                 {
 481                   if (dump_enabled_p ())
 482                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 483                                  "access also has a zero step\n");
 484                   return true;
 485                 }
 486             }
 487           continue;
 488         }
 489
 490       if (dist > 0 && DDR_REVERSED_P (ddr))
 491         {
 492           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 493              reversed (to make distance vector positive), and the actual
 494              distance is negative.  */
 495           if (dump_enabled_p ())
 496             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 497                              "dependence distance negative.\n");
 498           /* Record a negative dependence distance to later limit the
 499              amount of stmt copying / unrolling we can perform.
 500              Only need to handle read-after-write dependence.  */
 501           if (DR_IS_READ (drb)
 502               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 503                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 504             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 505           continue;
 506         }
 507
 508       unsigned int abs_dist = abs (dist);
 509       if (abs_dist >= 2 && abs_dist < *max_vf)
 510         {
 511           /* The dependence distance requires reduction of the maximal
 512              vectorization factor.  */
 513           *max_vf = abs (dist);
 514           if (dump_enabled_p ())
 515             dump_printf_loc (MSG_NOTE, vect_location,
 516                              "adjusting maximal vectorization factor to %i\n",
 517                              *max_vf);
 518         }
 519
 520       if (abs_dist >= *max_vf)
 521         {
 522           /* Dependence distance does not create dependence, as far as
 523              vectorization is concerned, in this case.  */
 524           if (dump_enabled_p ())
 525             dump_printf_loc (MSG_NOTE, vect_location,
 526                              "dependence distance >= VF.\n");
 527           continue;
 528         }
 529
 530       if (dump_enabled_p ())
 531         {
 532           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 533                        "not vectorized, possible dependence "
 534                        "between data-refs ");
 535           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 536           dump_printf (MSG_NOTE,  " and ");
 537           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 538           dump_printf (MSG_NOTE,  "\n");
 539         }
 540
 541       return true;
 542     }
 543
 544   return false;
 545 }
 546
 547 /* Function vect_analyze_data_ref_dependences.
 548
 549    Examine all the data references in the loop, and make sure there do not
 550    exist any data dependences between them.  Set *MAX_VF according to
 551    the maximum vectorization factor the data dependences allow.  */
 552
 553 bool
 554 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
 555                                    unsigned int *max_vf)
 556 {
 557   unsigned int i;
 558   struct data_dependence_relation *ddr;
 559
 560   if (dump_enabled_p ())
 561     dump_printf_loc (MSG_NOTE, vect_location,
 562                      "=== vect_analyze_data_ref_dependences ===\n");
 563
 564   LOOP_VINFO_DDRS (loop_vinfo)
 565     .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 566              * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 567   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 568   /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
 569   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 570                                 &LOOP_VINFO_DDRS (loop_vinfo),
 571                                 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
 572     return false;
 573
 574   /* For epilogues we either have no aliases or alias versioning
 575      was applied to original loop.  Therefore we may just get max_vf
 576      using VF of original loop.  */
 577   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 578     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 579   else
 580     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 581       if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
 582         return false;
 583
 584   return true;
 585 }
 586
 587
 588 /* Function vect_slp_analyze_data_ref_dependence.
 589
 590    Return TRUE if there (might) exist a dependence between a memory-reference
 591    DRA and a memory-reference DRB.  When versioning for alias may check a
 592    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 593    the data dependence.  */
 594
 595 static bool
 596 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
 597 {
 598   struct data_reference *dra = DDR_A (ddr);
 599   struct data_reference *drb = DDR_B (ddr);
 600
 601   /* We need to check dependences of statements marked as unvectorizable
 602      as well, they still can prohibit vectorization.  */
 603
 604   /* Independent data accesses.  */
 605   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 606     return false;
 607
 608   if (dra == drb)
 609     return false;
 610
 611   /* Read-read is OK.  */
 612   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 613     return false;
 614
 615   /* If dra and drb are part of the same interleaving chain consider
 616      them independent.  */
 617   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
 618       && (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
 619           == DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
 620     return false;
 621
 622   /* Unknown data dependence.  */
 623   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 624     {
 625       if  (dump_enabled_p ())
 626         {
 627           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 628                            "can't determine dependence between ");
 629           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 630           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 631           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 632           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 633         }
 634     }
 635   else if (dump_enabled_p ())
 636     {
 637       dump_printf_loc (MSG_NOTE, vect_location,
 638                        "determined dependence between ");
 639       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 640       dump_printf (MSG_NOTE, " and ");
 641       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 642       dump_printf (MSG_NOTE,  "\n");
 643     }
 644
 645   return true;
 646 }
 647
 648
 649 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 650    contain the vector of scalar stores of this instance if we are
 651    disambiguating the loads.  */
 652
 653 static bool
 654 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
 655                                    vec<gimple *> stores, gimple *last_store)
 656 {
 657   /* This walks over all stmts involved in the SLP load/store done
 658      in NODE verifying we can sink them up to the last stmt in the
 659      group.  */
 660   gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
 661   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 662     {
 663       gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
 664       if (access == last_access)
 665         continue;
 666       data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
 667       ao_ref ref;
 668       bool ref_initialized_p = false;
 669       for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
 670            gsi_stmt (gsi) != last_access; gsi_next (&gsi))
 671         {
 672           gimple *stmt = gsi_stmt (gsi);
 673           if (! gimple_vuse (stmt)
 674               || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
 675             continue;
 676
 677           /* If we couldn't record a (single) data reference for this
 678              stmt we have to resort to the alias oracle.  */
 679           data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
 680           if (!dr_b)
 681             {
 682               /* We are moving a store or sinking a load - this means
 683                  we cannot use TBAA for disambiguation.  */
 684               if (!ref_initialized_p)
 685                 ao_ref_init (&ref, DR_REF (dr_a));
 686               if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
 687                   || ref_maybe_used_by_stmt_p (stmt, &ref, false))
 688                 return false;
 689               continue;
 690             }
 691
 692           bool dependent = false;
 693           /* If we run into a store of this same instance (we've just
 694              marked those) then delay dependence checking until we run
 695              into the last store because this is where it will have
 696              been sunk to (and we verify if we can do that as well).  */
 697           if (gimple_visited_p (stmt))
 698             {
 699               if (stmt != last_store)
 700                 continue;
 701               unsigned i;
 702               gimple *store;
 703               FOR_EACH_VEC_ELT (stores, i, store)
 704                 {
 705                   data_reference *store_dr
 706                     = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
 707                   ddr_p ddr = initialize_data_dependence_relation
 708                                 (dr_a, store_dr, vNULL);
 709                   dependent = vect_slp_analyze_data_ref_dependence (ddr);
 710                   free_dependence_relation (ddr);
 711                   if (dependent)
 712                     break;
 713                 }
 714             }
 715           else
 716             {
 717               ddr_p ddr = initialize_data_dependence_relation (dr_a,
 718                                                                dr_b, vNULL);
 719               dependent = vect_slp_analyze_data_ref_dependence (ddr);
 720               free_dependence_relation (ddr);
 721             }
 722           if (dependent)
 723             return false;
 724         }
 725     }
 726   return true;
 727 }
 728
 729
 730 /* Function vect_analyze_data_ref_dependences.
 731
 732    Examine all the data references in the basic-block, and make sure there
 733    do not exist any data dependences between them.  Set *MAX_VF according to
 734    the maximum vectorization factor the data dependences allow.  */
 735
 736 bool
 737 vect_slp_analyze_instance_dependence (slp_instance instance)
 738 {
 739   if (dump_enabled_p ())
 740     dump_printf_loc (MSG_NOTE, vect_location,
 741                      "=== vect_slp_analyze_instance_dependence ===\n");
 742
 743   /* The stores of this instance are at the root of the SLP tree.  */
 744   slp_tree store = SLP_INSTANCE_TREE (instance);
 745   if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
 746     store = NULL;
 747
 748   /* Verify we can sink stores to the vectorized stmt insert location.  */
 749   gimple *last_store = NULL;
 750   if (store)
 751     {
 752       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
 753         return false;
 754
 755       /* Mark stores in this instance and remember the last one.  */
 756       last_store = vect_find_last_scalar_stmt_in_slp (store);
 757       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 758         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
 759     }
 760
 761   bool res = true;
 762
 763   /* Verify we can sink loads to the vectorized stmt insert location,
 764      special-casing stores of this instance.  */
 765   slp_tree load;
 766   unsigned int i;
 767   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
 768     if (! vect_slp_analyze_node_dependences (instance, load,
 769                                              store
 770                                              ? SLP_TREE_SCALAR_STMTS (store)
 771                                              : vNULL, last_store))
 772       {
 773         res = false;
 774         break;
 775       }
 776
 777   /* Unset the visited flag.  */
 778   if (store)
 779     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 780       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
 781
 782   return res;
 783 }
 784
 785 /* Record in VINFO the base alignment guarantee given by DRB.  STMT is
 786    the statement that contains DRB, which is useful for recording in the
 787    dump file.  */
 788
 789 static void
 790 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
 791                             innermost_loop_behavior *drb)
 792 {
 793   bool existed;
 794   innermost_loop_behavior *&entry
 795     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
 796   if (!existed || entry->base_alignment < drb->base_alignment)
 797     {
 798       entry = drb;
 799       if (dump_enabled_p ())
 800         {
 801           dump_printf_loc (MSG_NOTE, vect_location,
 802                            "recording new base alignment for ");
 803           dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
 804           dump_printf (MSG_NOTE, "\n");
 805           dump_printf_loc (MSG_NOTE, vect_location,
 806                            "  alignment:    %d\n", drb->base_alignment);
 807           dump_printf_loc (MSG_NOTE, vect_location,
 808                            "  misalignment: %d\n", drb->base_misalignment);
 809           dump_printf_loc (MSG_NOTE, vect_location,
 810                            "  based on:     ");
 811           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 812         }
 813     }
 814 }
 815
 816 /* If the region we're going to vectorize is reached, all unconditional
 817    data references occur at least once.  We can therefore pool the base
 818    alignment guarantees from each unconditional reference.  Do this by
 819    going through all the data references in VINFO and checking whether
 820    the containing statement makes the reference unconditionally.  If so,
 821    record the alignment of the base address in VINFO so that it can be
 822    used for all other references with the same base.  */
 823
 824 void
 825 vect_record_base_alignments (vec_info *vinfo)
 826 {
 827   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
 828   struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
 829   data_reference *dr;
 830   unsigned int i;
 831   FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
 832     if (!DR_IS_CONDITIONAL_IN_STMT (dr))
 833       {
 834         gimple *stmt = DR_STMT (dr);
 835         vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
 836
 837         /* If DR is nested in the loop that is being vectorized, we can also
 838            record the alignment of the base wrt the outer loop.  */
 839         if (loop && nested_in_vect_loop_p (loop, stmt))
 840           {
 841             stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 842             vect_record_base_alignment
 843               (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
 844           }
 845       }
 846 }
 847
 848 /* Return the target alignment for the vectorized form of DR.  */
 849
 850 static unsigned int
 851 vect_calculate_target_alignment (struct data_reference *dr)
 852 {
 853   gimple *stmt = DR_STMT (dr);
 854   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 855   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 856   return targetm.vectorize.preferred_vector_alignment (vectype);
 857 }
 858
 859 /* Function vect_compute_data_ref_alignment
 860
 861    Compute the misalignment of the data reference DR.
 862
 863    Output:
 864    1. If during the misalignment computation it is found that the data reference
 865       cannot be vectorized then false is returned.
 866    2. DR_MISALIGNMENT (DR) is defined.
 867
 868    FOR NOW: No analysis is actually performed. Misalignment is calculated
 869    only for trivial cases. TODO.  */
 870
 871 bool
 872 vect_compute_data_ref_alignment (struct data_reference *dr)
 873 {
 874   gimple *stmt = DR_STMT (dr);
 875   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 876   vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
 877   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 878   struct loop *loop = NULL;
 879   tree ref = DR_REF (dr);
 880   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 881
 882   if (dump_enabled_p ())
 883     dump_printf_loc (MSG_NOTE, vect_location,
 884                      "vect_compute_data_ref_alignment:\n");
 885
 886   if (loop_vinfo)
 887     loop = LOOP_VINFO_LOOP (loop_vinfo);
 888
 889   /* Initialize misalignment to unknown.  */
 890   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
 891
 892   innermost_loop_behavior *drb = vect_dr_behavior (dr);
 893   bool step_preserves_misalignment_p;
 894
 895   unsigned HOST_WIDE_INT vector_alignment
 896     = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
 897   DR_TARGET_ALIGNMENT (dr) = vector_alignment;
 898
 899   /* No step for BB vectorization.  */
 900   if (!loop)
 901     {
 902       gcc_assert (integer_zerop (drb->step));
 903       step_preserves_misalignment_p = true;
 904     }
 905
 906   /* In case the dataref is in an inner-loop of the loop that is being
 907      vectorized (LOOP), we use the base and misalignment information
 908      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 909      stays the same throughout the execution of the inner-loop, which is why
 910      we have to check that the stride of the dataref in the inner-loop evenly
 911      divides by the vector alignment.  */
 912   else if (nested_in_vect_loop_p (loop, stmt))
 913     {
 914       step_preserves_misalignment_p
 915         = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
 916
 917       if (dump_enabled_p ())
 918         {
 919           if (step_preserves_misalignment_p)
 920             dump_printf_loc (MSG_NOTE, vect_location,
 921                              "inner step divides the vector alignment.\n");
 922           else
 923             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 924                              "inner step doesn't divide the vector"
 925                              " alignment.\n");
 926         }
 927     }
 928
 929   /* Similarly we can only use base and misalignment information relative to
 930      an innermost loop if the misalignment stays the same throughout the
 931      execution of the loop.  As above, this is the case if the stride of
 932      the dataref evenly divides by the alignment.  */
 933   else
 934     {
 935       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 936       step_preserves_misalignment_p
 937         = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
 938
 939       if (!step_preserves_misalignment_p && dump_enabled_p ())
 940         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 941                          "step doesn't divide the vector alignment.\n");
 942     }
 943
 944   unsigned int base_alignment = drb->base_alignment;
 945   unsigned int base_misalignment = drb->base_misalignment;
 946
 947   /* Calculate the maximum of the pooled base address alignment and the
 948      alignment that we can compute for DR itself.  */
 949   innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
 950   if (entry && base_alignment < (*entry)->base_alignment)
 951     {
 952       base_alignment = (*entry)->base_alignment;
 953       base_misalignment = (*entry)->base_misalignment;
 954     }
 955
 956   if (drb->offset_alignment < vector_alignment
 957       || !step_preserves_misalignment_p
 958       /* We need to know whether the step wrt the vectorized loop is
 959          negative when computing the starting misalignment below.  */
 960       || TREE_CODE (drb->step) != INTEGER_CST)
 961     {
 962       if (dump_enabled_p ())
 963         {
 964           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 965                            "Unknown alignment for access: ");
 966           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 967           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 968         }
 969       return true;
 970     }
 971
 972   if (base_alignment < vector_alignment)
 973     {
 974       unsigned int max_alignment;
 975       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
 976       if (max_alignment < vector_alignment
 977           || !vect_can_force_dr_alignment_p (base,
 978                                              vector_alignment * BITS_PER_UNIT))
 979         {
 980           if (dump_enabled_p ())
 981             {
 982               dump_printf_loc (MSG_NOTE, vect_location,
 983                                "can't force alignment of ref: ");
 984               dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 985               dump_printf (MSG_NOTE, "\n");
 986             }
 987           return true;
 988         }
 989
 990       /* Force the alignment of the decl.
 991          NOTE: This is the only change to the code we make during
 992          the analysis phase, before deciding to vectorize the loop.  */
 993       if (dump_enabled_p ())
 994         {
 995           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
 996           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 997           dump_printf (MSG_NOTE, "\n");
 998         }
 999
1000       DR_VECT_AUX (dr)->base_decl = base;
1001       DR_VECT_AUX (dr)->base_misaligned = true;
1002       base_misalignment = 0;
1003     }
1004   poly_int64 misalignment
1005     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1006
1007   /* If this is a backward running DR then first access in the larger
1008      vectype actually is N-1 elements before the address in the DR.
1009      Adjust misalign accordingly.  */
1010   if (tree_int_cst_sgn (drb->step) < 0)
1011     /* PLUS because STEP is negative.  */
1012     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1013                      * TREE_INT_CST_LOW (drb->step));
1014
1015   unsigned int const_misalignment;
1016   if (!known_misalignment (misalignment, vector_alignment,
1017                            &const_misalignment))
1018     {
1019       if (dump_enabled_p ())
1020         {
1021           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1022                            "Non-constant misalignment for access: ");
1023           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1024           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1025         }
1026       return true;
1027     }
1028
1029   SET_DR_MISALIGNMENT (dr, const_misalignment);
1030
1031   if (dump_enabled_p ())
1032     {
1033       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1034                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1035       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1036       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1037     }
1038
1039   return true;
1040 }
1041
1042 /* Function vect_update_misalignment_for_peel.
1043    Sets DR's misalignment
1044    - to 0 if it has the same alignment as DR_PEEL,
1045    - to the misalignment computed using NPEEL if DR's salignment is known,
1046    - to -1 (unknown) otherwise.
1047
1048    DR - the data reference whose misalignment is to be adjusted.
1049    DR_PEEL - the data reference whose misalignment is being made
1050              zero in the vector loop by the peel.
1051    NPEEL - the number of iterations in the peel loop if the misalignment
1052            of DR_PEEL is known at compile time.  */
1053
1054 static void
1055 vect_update_misalignment_for_peel (struct data_reference *dr,
1056                                    struct data_reference *dr_peel, int npeel)
1057 {
1058   unsigned int i;
1059   vec<dr_p> same_aligned_drs;
1060   struct data_reference *current_dr;
1061   int dr_size = vect_get_scalar_dr_size (dr);
1062   int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1063   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1064   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1065
1066  /* For interleaved data accesses the step in the loop must be multiplied by
1067      the size of the interleaving group.  */
1068   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1069     dr_size *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
1070   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1071     dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1072
1073   /* It can be assumed that the data refs with the same alignment as dr_peel
1074      are aligned in the vector loop.  */
1075   same_aligned_drs
1076     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1077   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1078     {
1079       if (current_dr != dr)
1080         continue;
1081       gcc_assert (!known_alignment_for_access_p (dr)
1082                   || !known_alignment_for_access_p (dr_peel)
1083                   || (DR_MISALIGNMENT (dr) / dr_size
1084                       == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1085       SET_DR_MISALIGNMENT (dr, 0);
1086       return;
1087     }
1088
1089   if (known_alignment_for_access_p (dr)
1090       && known_alignment_for_access_p (dr_peel))
1091     {
1092       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1093       int misal = DR_MISALIGNMENT (dr);
1094       misal += negative ? -npeel * dr_size : npeel * dr_size;
1095       misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1096       SET_DR_MISALIGNMENT (dr, misal);
1097       return;
1098     }
1099
1100   if (dump_enabled_p ())
1101     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1102                      "to unknown (-1).\n");
1103   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1104 }
1105
1106
1107 /* Function verify_data_ref_alignment
1108
1109    Return TRUE if DR can be handled with respect to alignment.  */
1110
1111 static bool
1112 verify_data_ref_alignment (data_reference_p dr)
1113 {
1114   enum dr_alignment_support supportable_dr_alignment
1115     = vect_supportable_dr_alignment (dr, false);
1116   if (!supportable_dr_alignment)
1117     {
1118       if (dump_enabled_p ())
1119         {
1120           if (DR_IS_READ (dr))
1121             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1122                              "not vectorized: unsupported unaligned load.");
1123           else
1124             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1125                              "not vectorized: unsupported unaligned "
1126                              "store.");
1127
1128           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1129                              DR_REF (dr));
1130           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1131         }
1132       return false;
1133     }
1134
1135   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1136     dump_printf_loc (MSG_NOTE, vect_location,
1137                      "Vectorizing an unaligned access.\n");
1138
1139   return true;
1140 }
1141
1142 /* Function vect_verify_datarefs_alignment
1143
1144    Return TRUE if all data references in the loop can be
1145    handled with respect to alignment.  */
1146
1147 bool
1148 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1149 {
1150   vec<data_reference_p> datarefs = vinfo->datarefs;
1151   struct data_reference *dr;
1152   unsigned int i;
1153
1154   FOR_EACH_VEC_ELT (datarefs, i, dr)
1155     {
1156       gimple *stmt = DR_STMT (dr);
1157       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1158
1159       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1160         continue;
1161
1162       /* For interleaving, only the alignment of the first access matters.   */
1163       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1164           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1165         continue;
1166
1167       /* Strided accesses perform only component accesses, alignment is
1168          irrelevant for them.  */
1169       if (STMT_VINFO_STRIDED_P (stmt_info)
1170           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1171         continue;
1172
1173       if (! verify_data_ref_alignment (dr))
1174         return false;
1175     }
1176
1177   return true;
1178 }
1179
1180 /* Given an memory reference EXP return whether its alignment is less
1181    than its size.  */
1182
1183 static bool
1184 not_size_aligned (tree exp)
1185 {
1186   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1187     return true;
1188
1189   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1190           > get_object_alignment (exp));
1191 }
1192
1193 /* Function vector_alignment_reachable_p
1194
1195    Return true if vector alignment for DR is reachable by peeling
1196    a few loop iterations.  Return false otherwise.  */
1197
1198 static bool
1199 vector_alignment_reachable_p (struct data_reference *dr)
1200 {
1201   gimple *stmt = DR_STMT (dr);
1202   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1203   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1204
1205   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1206     {
1207       /* For interleaved access we peel only if number of iterations in
1208          the prolog loop ({VF - misalignment}), is a multiple of the
1209          number of the interleaved accesses.  */
1210       int elem_size, mis_in_elements;
1211
1212       /* FORNOW: handle only known alignment.  */
1213       if (!known_alignment_for_access_p (dr))
1214         return false;
1215
1216       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1217       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1218       elem_size = vector_element_size (vector_size, nelements);
1219       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1220
1221       if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1222         return false;
1223     }
1224
1225   /* If misalignment is known at the compile time then allow peeling
1226      only if natural alignment is reachable through peeling.  */
1227   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1228     {
1229       HOST_WIDE_INT elmsize =
1230                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1231       if (dump_enabled_p ())
1232         {
1233           dump_printf_loc (MSG_NOTE, vect_location,
1234                            "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1235           dump_printf (MSG_NOTE,
1236                        ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1237         }
1238       if (DR_MISALIGNMENT (dr) % elmsize)
1239         {
1240           if (dump_enabled_p ())
1241             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1242                              "data size does not divide the misalignment.\n");
1243           return false;
1244         }
1245     }
1246
1247   if (!known_alignment_for_access_p (dr))
1248     {
1249       tree type = TREE_TYPE (DR_REF (dr));
1250       bool is_packed = not_size_aligned (DR_REF (dr));
1251       if (dump_enabled_p ())
1252         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1253                          "Unknown misalignment, %snaturally aligned\n",
1254                          is_packed ? "not " : "");
1255       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1256     }
1257
1258   return true;
1259 }
1260
1261
1262 /* Calculate the cost of the memory access represented by DR.  */
1263
1264 static void
1265 vect_get_data_access_cost (struct data_reference *dr,
1266                            unsigned int *inside_cost,
1267                            unsigned int *outside_cost,
1268                            stmt_vector_for_cost *body_cost_vec,
1269                            stmt_vector_for_cost *prologue_cost_vec)
1270 {
1271   gimple *stmt = DR_STMT (dr);
1272   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1273   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1274   int ncopies;
1275
1276   if (PURE_SLP_STMT (stmt_info))
1277     ncopies = 1;
1278   else
1279     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1280
1281   if (DR_IS_READ (dr))
1282     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1283                         prologue_cost_vec, body_cost_vec, false);
1284   else
1285     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1286
1287   if (dump_enabled_p ())
1288     dump_printf_loc (MSG_NOTE, vect_location,
1289                      "vect_get_data_access_cost: inside_cost = %d, "
1290                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1291 }
1292
1293
1294 typedef struct _vect_peel_info
1295 {
1296   struct data_reference *dr;
1297   int npeel;
1298   unsigned int count;
1299 } *vect_peel_info;
1300
1301 typedef struct _vect_peel_extended_info
1302 {
1303   struct _vect_peel_info peel_info;
1304   unsigned int inside_cost;
1305   unsigned int outside_cost;
1306 } *vect_peel_extended_info;
1307
1308
1309 /* Peeling hashtable helpers.  */
1310
1311 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1312 {
1313   static inline hashval_t hash (const _vect_peel_info *);
1314   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1315 };
1316
1317 inline hashval_t
1318 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1319 {
1320   return (hashval_t) peel_info->npeel;
1321 }
1322
1323 inline bool
1324 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1325 {
1326   return (a->npeel == b->npeel);
1327 }
1328
1329
1330 /* Insert DR into peeling hash table with NPEEL as key.  */
1331
1332 static void
1333 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1334                           loop_vec_info loop_vinfo, struct data_reference *dr,
1335                           int npeel)
1336 {
1337   struct _vect_peel_info elem, *slot;
1338   _vect_peel_info **new_slot;
1339   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1340
1341   elem.npeel = npeel;
1342   slot = peeling_htab->find (&elem);
1343   if (slot)
1344     slot->count++;
1345   else
1346     {
1347       slot = XNEW (struct _vect_peel_info);
1348       slot->npeel = npeel;
1349       slot->dr = dr;
1350       slot->count = 1;
1351       new_slot = peeling_htab->find_slot (slot, INSERT);
1352       *new_slot = slot;
1353     }
1354
1355   if (!supportable_dr_alignment
1356       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1357     slot->count += VECT_MAX_COST;
1358 }
1359
1360
1361 /* Traverse peeling hash table to find peeling option that aligns maximum
1362    number of data accesses.  */
1363
1364 int
1365 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1366                                      _vect_peel_extended_info *max)
1367 {
1368   vect_peel_info elem = *slot;
1369
1370   if (elem->count > max->peel_info.count
1371       || (elem->count == max->peel_info.count
1372           && max->peel_info.npeel > elem->npeel))
1373     {
1374       max->peel_info.npeel = elem->npeel;
1375       max->peel_info.count = elem->count;
1376       max->peel_info.dr = elem->dr;
1377     }
1378
1379   return 1;
1380 }
1381
1382 /* Get the costs of peeling NPEEL iterations checking data access costs
1383    for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1384    misalignment will be zero after peeling.  */
1385
1386 static void
1387 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1388                                 struct data_reference *dr0,
1389                                 unsigned int *inside_cost,
1390                                 unsigned int *outside_cost,
1391                                 stmt_vector_for_cost *body_cost_vec,
1392                                 stmt_vector_for_cost *prologue_cost_vec,
1393                                 unsigned int npeel,
1394                                 bool unknown_misalignment)
1395 {
1396   unsigned i;
1397   data_reference *dr;
1398
1399   FOR_EACH_VEC_ELT (datarefs, i, dr)
1400     {
1401       gimple *stmt = DR_STMT (dr);
1402       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1403       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1404         continue;
1405
1406       /* For interleaving, only the alignment of the first access
1407          matters.  */
1408       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1409           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1410         continue;
1411
1412       /* Strided accesses perform only component accesses, alignment is
1413          irrelevant for them.  */
1414       if (STMT_VINFO_STRIDED_P (stmt_info)
1415           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1416         continue;
1417
1418       int save_misalignment;
1419       save_misalignment = DR_MISALIGNMENT (dr);
1420       if (npeel == 0)
1421         ;
1422       else if (unknown_misalignment && dr == dr0)
1423         SET_DR_MISALIGNMENT (dr, 0);
1424       else
1425         vect_update_misalignment_for_peel (dr, dr0, npeel);
1426       vect_get_data_access_cost (dr, inside_cost, outside_cost,
1427                                  body_cost_vec, prologue_cost_vec);
1428       SET_DR_MISALIGNMENT (dr, save_misalignment);
1429     }
1430 }
1431
1432 /* Traverse peeling hash table and calculate cost for each peeling option.
1433    Find the one with the lowest cost.  */
1434
1435 int
1436 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1437                                    _vect_peel_extended_info *min)
1438 {
1439   vect_peel_info elem = *slot;
1440   int dummy;
1441   unsigned int inside_cost = 0, outside_cost = 0;
1442   gimple *stmt = DR_STMT (elem->dr);
1443   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1444   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1445   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1446                        epilogue_cost_vec;
1447
1448   prologue_cost_vec.create (2);
1449   body_cost_vec.create (2);
1450   epilogue_cost_vec.create (2);
1451
1452   vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1453                                   elem->dr, &inside_cost, &outside_cost,
1454                                   &body_cost_vec, &prologue_cost_vec,
1455                                   elem->npeel, false);
1456
1457   body_cost_vec.release ();
1458
1459   outside_cost += vect_get_known_peeling_cost
1460     (loop_vinfo, elem->npeel, &dummy,
1461      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1462      &prologue_cost_vec, &epilogue_cost_vec);
1463
1464   /* Prologue and epilogue costs are added to the target model later.
1465      These costs depend only on the scalar iteration cost, the
1466      number of peeling iterations finally chosen, and the number of
1467      misaligned statements.  So discard the information found here.  */
1468   prologue_cost_vec.release ();
1469   epilogue_cost_vec.release ();
1470
1471   if (inside_cost < min->inside_cost
1472       || (inside_cost == min->inside_cost
1473           && outside_cost < min->outside_cost))
1474     {
1475       min->inside_cost = inside_cost;
1476       min->outside_cost = outside_cost;
1477       min->peel_info.dr = elem->dr;
1478       min->peel_info.npeel = elem->npeel;
1479       min->peel_info.count = elem->count;
1480     }
1481
1482   return 1;
1483 }
1484
1485
1486 /* Choose best peeling option by traversing peeling hash table and either
1487    choosing an option with the lowest cost (if cost model is enabled) or the
1488    option that aligns as many accesses as possible.  */
1489
1490 static struct _vect_peel_extended_info
1491 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1492                                        loop_vec_info loop_vinfo)
1493 {
1494    struct _vect_peel_extended_info res;
1495
1496    res.peel_info.dr = NULL;
1497
1498    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1499      {
1500        res.inside_cost = INT_MAX;
1501        res.outside_cost = INT_MAX;
1502        peeling_htab->traverse <_vect_peel_extended_info *,
1503                                vect_peeling_hash_get_lowest_cost> (&res);
1504      }
1505    else
1506      {
1507        res.peel_info.count = 0;
1508        peeling_htab->traverse <_vect_peel_extended_info *,
1509                                vect_peeling_hash_get_most_frequent> (&res);
1510        res.inside_cost = 0;
1511        res.outside_cost = 0;
1512      }
1513
1514    return res;
1515 }
1516
1517 /* Return true if the new peeling NPEEL is supported.  */
1518
1519 static bool
1520 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1521                           unsigned npeel)
1522 {
1523   unsigned i;
1524   struct data_reference *dr = NULL;
1525   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1526   gimple *stmt;
1527   stmt_vec_info stmt_info;
1528   enum dr_alignment_support supportable_dr_alignment;
1529
1530   /* Ensure that all data refs can be vectorized after the peel.  */
1531   FOR_EACH_VEC_ELT (datarefs, i, dr)
1532     {
1533       int save_misalignment;
1534
1535       if (dr == dr0)
1536         continue;
1537
1538       stmt = DR_STMT (dr);
1539       stmt_info = vinfo_for_stmt (stmt);
1540       /* For interleaving, only the alignment of the first access
1541          matters.  */
1542       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1543           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1544         continue;
1545
1546       /* Strided accesses perform only component accesses, alignment is
1547          irrelevant for them.  */
1548       if (STMT_VINFO_STRIDED_P (stmt_info)
1549           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1550         continue;
1551
1552       save_misalignment = DR_MISALIGNMENT (dr);
1553       vect_update_misalignment_for_peel (dr, dr0, npeel);
1554       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1555       SET_DR_MISALIGNMENT (dr, save_misalignment);
1556
1557       if (!supportable_dr_alignment)
1558         return false;
1559     }
1560
1561   return true;
1562 }
1563
1564 /* Function vect_enhance_data_refs_alignment
1565
1566    This pass will use loop versioning and loop peeling in order to enhance
1567    the alignment of data references in the loop.
1568
1569    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1570    original loop is to be vectorized.  Any other loops that are created by
1571    the transformations performed in this pass - are not supposed to be
1572    vectorized.  This restriction will be relaxed.
1573
1574    This pass will require a cost model to guide it whether to apply peeling
1575    or versioning or a combination of the two.  For example, the scheme that
1576    intel uses when given a loop with several memory accesses, is as follows:
1577    choose one memory access ('p') which alignment you want to force by doing
1578    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1579    other accesses are not necessarily aligned, or (2) use loop versioning to
1580    generate one loop in which all accesses are aligned, and another loop in
1581    which only 'p' is necessarily aligned.
1582
1583    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1584    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1585    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1586
1587    Devising a cost model is the most critical aspect of this work.  It will
1588    guide us on which access to peel for, whether to use loop versioning, how
1589    many versions to create, etc.  The cost model will probably consist of
1590    generic considerations as well as target specific considerations (on
1591    powerpc for example, misaligned stores are more painful than misaligned
1592    loads).
1593
1594    Here are the general steps involved in alignment enhancements:
1595
1596      -- original loop, before alignment analysis:
1597         for (i=0; i<N; i++){
1598           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1599           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1600         }
1601
1602      -- After vect_compute_data_refs_alignment:
1603         for (i=0; i<N; i++){
1604           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1605           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1606         }
1607
1608      -- Possibility 1: we do loop versioning:
1609      if (p is aligned) {
1610         for (i=0; i<N; i++){    # loop 1A
1611           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1612           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1613         }
1614      }
1615      else {
1616         for (i=0; i<N; i++){    # loop 1B
1617           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1618           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1619         }
1620      }
1621
1622      -- Possibility 2: we do loop peeling:
1623      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1624         x = q[i];
1625         p[i] = y;
1626      }
1627      for (i = 3; i < N; i++){   # loop 2A
1628         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1629         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1630      }
1631
1632      -- Possibility 3: combination of loop peeling and versioning:
1633      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1634         x = q[i];
1635         p[i] = y;
1636      }
1637      if (p is aligned) {
1638         for (i = 3; i<N; i++){  # loop 3A
1639           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1640           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1641         }
1642      }
1643      else {
1644         for (i = 3; i<N; i++){  # loop 3B
1645           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1646           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1647         }
1648      }
1649
1650      These loops are later passed to loop_transform to be vectorized.  The
1651      vectorizer will use the alignment information to guide the transformation
1652      (whether to generate regular loads/stores, or with special handling for
1653      misalignment).  */
1654
1655 bool
1656 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1657 {
1658   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1659   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1660   enum dr_alignment_support supportable_dr_alignment;
1661   struct data_reference *dr0 = NULL, *first_store = NULL;
1662   struct data_reference *dr;
1663   unsigned int i, j;
1664   bool do_peeling = false;
1665   bool do_versioning = false;
1666   bool stat;
1667   gimple *stmt;
1668   stmt_vec_info stmt_info;
1669   unsigned int npeel = 0;
1670   bool one_misalignment_known = false;
1671   bool one_misalignment_unknown = false;
1672   bool one_dr_unsupportable = false;
1673   struct data_reference *unsupportable_dr = NULL;
1674   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1675   unsigned possible_npeel_number = 1;
1676   tree vectype;
1677   unsigned int mis, same_align_drs_max = 0;
1678   hash_table<peel_info_hasher> peeling_htab (1);
1679
1680   if (dump_enabled_p ())
1681     dump_printf_loc (MSG_NOTE, vect_location,
1682                      "=== vect_enhance_data_refs_alignment ===\n");
1683
1684   /* Reset data so we can safely be called multiple times.  */
1685   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1686   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1687
1688   /* While cost model enhancements are expected in the future, the high level
1689      view of the code at this time is as follows:
1690
1691      A) If there is a misaligned access then see if peeling to align
1692         this access can make all data references satisfy
1693         vect_supportable_dr_alignment.  If so, update data structures
1694         as needed and return true.
1695
1696      B) If peeling wasn't possible and there is a data reference with an
1697         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1698         then see if loop versioning checks can be used to make all data
1699         references satisfy vect_supportable_dr_alignment.  If so, update
1700         data structures as needed and return true.
1701
1702      C) If neither peeling nor versioning were successful then return false if
1703         any data reference does not satisfy vect_supportable_dr_alignment.
1704
1705      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1706
1707      Note, Possibility 3 above (which is peeling and versioning together) is not
1708      being done at this time.  */
1709
1710   /* (1) Peeling to force alignment.  */
1711
1712   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1713      Considerations:
1714      + How many accesses will become aligned due to the peeling
1715      - How many accesses will become unaligned due to the peeling,
1716        and the cost of misaligned accesses.
1717      - The cost of peeling (the extra runtime checks, the increase
1718        in code size).  */
1719
1720   FOR_EACH_VEC_ELT (datarefs, i, dr)
1721     {
1722       stmt = DR_STMT (dr);
1723       stmt_info = vinfo_for_stmt (stmt);
1724
1725       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1726         continue;
1727
1728       /* For interleaving, only the alignment of the first access
1729          matters.  */
1730       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1731           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1732         continue;
1733
1734       /* For invariant accesses there is nothing to enhance.  */
1735       if (integer_zerop (DR_STEP (dr)))
1736         continue;
1737
1738       /* Strided accesses perform only component accesses, alignment is
1739          irrelevant for them.  */
1740       if (STMT_VINFO_STRIDED_P (stmt_info)
1741           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1742         continue;
1743
1744       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1745       do_peeling = vector_alignment_reachable_p (dr);
1746       if (do_peeling)
1747         {
1748           if (known_alignment_for_access_p (dr))
1749             {
1750               unsigned int npeel_tmp = 0;
1751               bool negative = tree_int_cst_compare (DR_STEP (dr),
1752                                                     size_zero_node) < 0;
1753
1754               vectype = STMT_VINFO_VECTYPE (stmt_info);
1755               unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1756               unsigned int dr_size = vect_get_scalar_dr_size (dr);
1757               mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1758               if (DR_MISALIGNMENT (dr) != 0)
1759                 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1760
1761               /* For multiple types, it is possible that the bigger type access
1762                  will have more than one peeling option.  E.g., a loop with two
1763                  types: one of size (vector size / 4), and the other one of
1764                  size (vector size / 8).  Vectorization factor will 8.  If both
1765                  accesses are misaligned by 3, the first one needs one scalar
1766                  iteration to be aligned, and the second one needs 5.  But the
1767                  first one will be aligned also by peeling 5 scalar
1768                  iterations, and in that case both accesses will be aligned.
1769                  Hence, except for the immediate peeling amount, we also want
1770                  to try to add full vector size, while we don't exceed
1771                  vectorization factor.
1772                  We do this automatically for cost model, since we calculate
1773                  cost for every peeling option.  */
1774               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1775                 {
1776                   poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1777                                           ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1778                   possible_npeel_number
1779                     = vect_get_num_vectors (nscalars, vectype);
1780
1781                   /* NPEEL_TMP is 0 when there is no misalignment, but also
1782                      allow peeling NELEMENTS.  */
1783                   if (DR_MISALIGNMENT (dr) == 0)
1784                     possible_npeel_number++;
1785                 }
1786
1787               /* Save info about DR in the hash table.  Also include peeling
1788                  amounts according to the explanation above.  */
1789               for (j = 0; j < possible_npeel_number; j++)
1790                 {
1791                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1792                                             dr, npeel_tmp);
1793                   npeel_tmp += target_align / dr_size;
1794                 }
1795
1796               one_misalignment_known = true;
1797             }
1798           else
1799             {
1800               /* If we don't know any misalignment values, we prefer
1801                  peeling for data-ref that has the maximum number of data-refs
1802                  with the same alignment, unless the target prefers to align
1803                  stores over load.  */
1804               unsigned same_align_drs
1805                 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1806               if (!dr0
1807                   || same_align_drs_max < same_align_drs)
1808                 {
1809                   same_align_drs_max = same_align_drs;
1810                   dr0 = dr;
1811                 }
1812               /* For data-refs with the same number of related
1813                  accesses prefer the one where the misalign
1814                  computation will be invariant in the outermost loop.  */
1815               else if (same_align_drs_max == same_align_drs)
1816                 {
1817                   struct loop *ivloop0, *ivloop;
1818                   ivloop0 = outermost_invariant_loop_for_expr
1819                     (loop, DR_BASE_ADDRESS (dr0));
1820                   ivloop = outermost_invariant_loop_for_expr
1821                     (loop, DR_BASE_ADDRESS (dr));
1822                   if ((ivloop && !ivloop0)
1823                       || (ivloop && ivloop0
1824                           && flow_loop_nested_p (ivloop, ivloop0)))
1825                     dr0 = dr;
1826                 }
1827
1828               one_misalignment_unknown = true;
1829
1830               /* Check for data refs with unsupportable alignment that
1831                  can be peeled.  */
1832               if (!supportable_dr_alignment)
1833               {
1834                 one_dr_unsupportable = true;
1835                 unsupportable_dr = dr;
1836               }
1837
1838               if (!first_store && DR_IS_WRITE (dr))
1839                 first_store = dr;
1840             }
1841         }
1842       else
1843         {
1844           if (!aligned_access_p (dr))
1845             {
1846               if (dump_enabled_p ())
1847                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1848                                  "vector alignment may not be reachable\n");
1849               break;
1850             }
1851         }
1852     }
1853
1854   /* Check if we can possibly peel the loop.  */
1855   if (!vect_can_advance_ivs_p (loop_vinfo)
1856       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1857       || loop->inner)
1858     do_peeling = false;
1859
1860   struct _vect_peel_extended_info peel_for_known_alignment;
1861   struct _vect_peel_extended_info peel_for_unknown_alignment;
1862   struct _vect_peel_extended_info best_peel;
1863
1864   peel_for_unknown_alignment.inside_cost = INT_MAX;
1865   peel_for_unknown_alignment.outside_cost = INT_MAX;
1866   peel_for_unknown_alignment.peel_info.count = 0;
1867
1868   if (do_peeling
1869       && one_misalignment_unknown)
1870     {
1871       /* Check if the target requires to prefer stores over loads, i.e., if
1872          misaligned stores are more expensive than misaligned loads (taking
1873          drs with same alignment into account).  */
1874       unsigned int load_inside_cost = 0;
1875       unsigned int load_outside_cost = 0;
1876       unsigned int store_inside_cost = 0;
1877       unsigned int store_outside_cost = 0;
1878       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1879
1880       stmt_vector_for_cost dummy;
1881       dummy.create (2);
1882       vect_get_peeling_costs_all_drs (datarefs, dr0,
1883                                       &load_inside_cost,
1884                                       &load_outside_cost,
1885                                       &dummy, &dummy, estimated_npeels, true);
1886       dummy.release ();
1887
1888       if (first_store)
1889         {
1890           dummy.create (2);
1891           vect_get_peeling_costs_all_drs (datarefs, first_store,
1892                                           &store_inside_cost,
1893                                           &store_outside_cost,
1894                                           &dummy, &dummy,
1895                                           estimated_npeels, true);
1896           dummy.release ();
1897         }
1898       else
1899         {
1900           store_inside_cost = INT_MAX;
1901           store_outside_cost = INT_MAX;
1902         }
1903
1904       if (load_inside_cost > store_inside_cost
1905           || (load_inside_cost == store_inside_cost
1906               && load_outside_cost > store_outside_cost))
1907         {
1908           dr0 = first_store;
1909           peel_for_unknown_alignment.inside_cost = store_inside_cost;
1910           peel_for_unknown_alignment.outside_cost = store_outside_cost;
1911         }
1912       else
1913         {
1914           peel_for_unknown_alignment.inside_cost = load_inside_cost;
1915           peel_for_unknown_alignment.outside_cost = load_outside_cost;
1916         }
1917
1918       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1919       prologue_cost_vec.create (2);
1920       epilogue_cost_vec.create (2);
1921
1922       int dummy2;
1923       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1924         (loop_vinfo, estimated_npeels, &dummy2,
1925          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1926          &prologue_cost_vec, &epilogue_cost_vec);
1927
1928       prologue_cost_vec.release ();
1929       epilogue_cost_vec.release ();
1930
1931       peel_for_unknown_alignment.peel_info.count = 1
1932         + STMT_VINFO_SAME_ALIGN_REFS
1933         (vinfo_for_stmt (DR_STMT (dr0))).length ();
1934     }
1935
1936   peel_for_unknown_alignment.peel_info.npeel = 0;
1937   peel_for_unknown_alignment.peel_info.dr = dr0;
1938
1939   best_peel = peel_for_unknown_alignment;
1940
1941   peel_for_known_alignment.inside_cost = INT_MAX;
1942   peel_for_known_alignment.outside_cost = INT_MAX;
1943   peel_for_known_alignment.peel_info.count = 0;
1944   peel_for_known_alignment.peel_info.dr = NULL;
1945
1946   if (do_peeling && one_misalignment_known)
1947     {
1948       /* Peeling is possible, but there is no data access that is not supported
1949          unless aligned.  So we try to choose the best possible peeling from
1950          the hash table.  */
1951       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1952         (&peeling_htab, loop_vinfo);
1953     }
1954
1955   /* Compare costs of peeling for known and unknown alignment. */
1956   if (peel_for_known_alignment.peel_info.dr != NULL
1957       && peel_for_unknown_alignment.inside_cost
1958       >= peel_for_known_alignment.inside_cost)
1959     {
1960       best_peel = peel_for_known_alignment;
1961
1962       /* If the best peeling for known alignment has NPEEL == 0, perform no
1963          peeling at all except if there is an unsupportable dr that we can
1964          align.  */
1965       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1966         do_peeling = false;
1967     }
1968
1969   /* If there is an unsupportable data ref, prefer this over all choices so far
1970      since we'd have to discard a chosen peeling except when it accidentally
1971      aligned the unsupportable data ref.  */
1972   if (one_dr_unsupportable)
1973     dr0 = unsupportable_dr;
1974   else if (do_peeling)
1975     {
1976       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1977          TODO: Use nopeel_outside_cost or get rid of it?  */
1978       unsigned nopeel_inside_cost = 0;
1979       unsigned nopeel_outside_cost = 0;
1980
1981       stmt_vector_for_cost dummy;
1982       dummy.create (2);
1983       vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1984                                       &nopeel_outside_cost, &dummy, &dummy,
1985                                       0, false);
1986       dummy.release ();
1987
1988       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
1989          costs will be recorded.  */
1990       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1991       prologue_cost_vec.create (2);
1992       epilogue_cost_vec.create (2);
1993
1994       int dummy2;
1995       nopeel_outside_cost += vect_get_known_peeling_cost
1996         (loop_vinfo, 0, &dummy2,
1997          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1998          &prologue_cost_vec, &epilogue_cost_vec);
1999
2000       prologue_cost_vec.release ();
2001       epilogue_cost_vec.release ();
2002
2003       npeel = best_peel.peel_info.npeel;
2004       dr0 = best_peel.peel_info.dr;
2005
2006       /* If no peeling is not more expensive than the best peeling we
2007          have so far, don't perform any peeling.  */
2008       if (nopeel_inside_cost <= best_peel.inside_cost)
2009         do_peeling = false;
2010     }
2011
2012   if (do_peeling)
2013     {
2014       stmt = DR_STMT (dr0);
2015       stmt_info = vinfo_for_stmt (stmt);
2016       vectype = STMT_VINFO_VECTYPE (stmt_info);
2017
2018       if (known_alignment_for_access_p (dr0))
2019         {
2020           bool negative = tree_int_cst_compare (DR_STEP (dr0),
2021                                                 size_zero_node) < 0;
2022           if (!npeel)
2023             {
2024               /* Since it's known at compile time, compute the number of
2025                  iterations in the peeled loop (the peeling factor) for use in
2026                  updating DR_MISALIGNMENT values.  The peeling factor is the
2027                  vectorization factor minus the misalignment as an element
2028                  count.  */
2029               mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2030               unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2031               npeel = ((mis & (target_align - 1))
2032                        / vect_get_scalar_dr_size (dr0));
2033             }
2034
2035           /* For interleaved data access every iteration accesses all the
2036              members of the group, therefore we divide the number of iterations
2037              by the group size.  */
2038           stmt_info = vinfo_for_stmt (DR_STMT (dr0));
2039           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2040             npeel /= DR_GROUP_SIZE (stmt_info);
2041
2042           if (dump_enabled_p ())
2043             dump_printf_loc (MSG_NOTE, vect_location,
2044                              "Try peeling by %d\n", npeel);
2045         }
2046
2047       /* Ensure that all datarefs can be vectorized after the peel.  */
2048       if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2049         do_peeling = false;
2050
2051       /* Check if all datarefs are supportable and log.  */
2052       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2053         {
2054           stat = vect_verify_datarefs_alignment (loop_vinfo);
2055           if (!stat)
2056             do_peeling = false;
2057           else
2058             return stat;
2059         }
2060
2061       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2062       if (do_peeling)
2063         {
2064           unsigned max_allowed_peel
2065             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2066           if (max_allowed_peel != (unsigned)-1)
2067             {
2068               unsigned max_peel = npeel;
2069               if (max_peel == 0)
2070                 {
2071                   unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2072                   max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2073                 }
2074               if (max_peel > max_allowed_peel)
2075                 {
2076                   do_peeling = false;
2077                   if (dump_enabled_p ())
2078                     dump_printf_loc (MSG_NOTE, vect_location,
2079                         "Disable peeling, max peels reached: %d\n", max_peel);
2080                 }
2081             }
2082         }
2083
2084       /* Cost model #2 - if peeling may result in a remaining loop not
2085          iterating enough to be vectorized then do not peel.  Since this
2086          is a cost heuristic rather than a correctness decision, use the
2087          most likely runtime value for variable vectorization factors.  */
2088       if (do_peeling
2089           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2090         {
2091           unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2092           unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2093           if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2094               < assumed_vf + max_peel)
2095             do_peeling = false;
2096         }
2097
2098       if (do_peeling)
2099         {
2100           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2101              If the misalignment of DR_i is identical to that of dr0 then set
2102              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2103              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2104              by the peeling factor times the element size of DR_i (MOD the
2105              vectorization factor times the size).  Otherwise, the
2106              misalignment of DR_i must be set to unknown.  */
2107           FOR_EACH_VEC_ELT (datarefs, i, dr)
2108             if (dr != dr0)
2109               {
2110                 /* Strided accesses perform only component accesses, alignment
2111                    is irrelevant for them.  */
2112                 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2113                 if (STMT_VINFO_STRIDED_P (stmt_info)
2114                     && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2115                   continue;
2116
2117                 vect_update_misalignment_for_peel (dr, dr0, npeel);
2118               }
2119
2120           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2121           if (npeel)
2122             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2123           else
2124             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2125               = DR_MISALIGNMENT (dr0);
2126           SET_DR_MISALIGNMENT (dr0, 0);
2127           if (dump_enabled_p ())
2128             {
2129               dump_printf_loc (MSG_NOTE, vect_location,
2130                                "Alignment of access forced using peeling.\n");
2131               dump_printf_loc (MSG_NOTE, vect_location,
2132                                "Peeling for alignment will be applied.\n");
2133             }
2134
2135           /* The inside-loop cost will be accounted for in vectorizable_load
2136              and vectorizable_store correctly with adjusted alignments.
2137              Drop the body_cst_vec on the floor here.  */
2138           stat = vect_verify_datarefs_alignment (loop_vinfo);
2139           gcc_assert (stat);
2140           return stat;
2141         }
2142     }
2143
2144   /* (2) Versioning to force alignment.  */
2145
2146   /* Try versioning if:
2147      1) optimize loop for speed
2148      2) there is at least one unsupported misaligned data ref with an unknown
2149         misalignment, and
2150      3) all misaligned data refs with a known misalignment are supported, and
2151      4) the number of runtime alignment checks is within reason.  */
2152
2153   do_versioning =
2154         optimize_loop_nest_for_speed_p (loop)
2155         && (!loop->inner); /* FORNOW */
2156
2157   if (do_versioning)
2158     {
2159       FOR_EACH_VEC_ELT (datarefs, i, dr)
2160         {
2161           stmt = DR_STMT (dr);
2162           stmt_info = vinfo_for_stmt (stmt);
2163
2164           /* For interleaving, only the alignment of the first access
2165              matters.  */
2166           if (aligned_access_p (dr)
2167               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2168                   && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2169             continue;
2170
2171           if (STMT_VINFO_STRIDED_P (stmt_info))
2172             {
2173               /* Strided loads perform only component accesses, alignment is
2174                  irrelevant for them.  */
2175               if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2176                 continue;
2177               do_versioning = false;
2178               break;
2179             }
2180
2181           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2182
2183           if (!supportable_dr_alignment)
2184             {
2185               gimple *stmt;
2186               int mask;
2187               tree vectype;
2188
2189               if (known_alignment_for_access_p (dr)
2190                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2191                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2192                 {
2193                   do_versioning = false;
2194                   break;
2195                 }
2196
2197               stmt = DR_STMT (dr);
2198               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2199               gcc_assert (vectype);
2200
2201               /* At present we don't support versioning for alignment
2202                  with variable VF, since there's no guarantee that the
2203                  VF is a power of two.  We could relax this if we added
2204                  a way of enforcing a power-of-two size.  */
2205               unsigned HOST_WIDE_INT size;
2206               if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2207                 {
2208                   do_versioning = false;
2209                   break;
2210                 }
2211
2212               /* The rightmost bits of an aligned address must be zeros.
2213                  Construct the mask needed for this test.  For example,
2214                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2215                  mask must be 15 = 0xf. */
2216               mask = size - 1;
2217
2218               /* FORNOW: use the same mask to test all potentially unaligned
2219                  references in the loop.  The vectorizer currently supports
2220                  a single vector size, see the reference to
2221                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2222                  vectorization factor is computed.  */
2223               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2224                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2225               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2226               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2227                       DR_STMT (dr));
2228             }
2229         }
2230
2231       /* Versioning requires at least one misaligned data reference.  */
2232       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2233         do_versioning = false;
2234       else if (!do_versioning)
2235         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2236     }
2237
2238   if (do_versioning)
2239     {
2240       vec<gimple *> may_misalign_stmts
2241         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2242       gimple *stmt;
2243
2244       /* It can now be assumed that the data references in the statements
2245          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2246          of the loop being vectorized.  */
2247       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2248         {
2249           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2250           dr = STMT_VINFO_DATA_REF (stmt_info);
2251           SET_DR_MISALIGNMENT (dr, 0);
2252           if (dump_enabled_p ())
2253             dump_printf_loc (MSG_NOTE, vect_location,
2254                              "Alignment of access forced using versioning.\n");
2255         }
2256
2257       if (dump_enabled_p ())
2258         dump_printf_loc (MSG_NOTE, vect_location,
2259                          "Versioning for alignment will be applied.\n");
2260
2261       /* Peeling and versioning can't be done together at this time.  */
2262       gcc_assert (! (do_peeling && do_versioning));
2263
2264       stat = vect_verify_datarefs_alignment (loop_vinfo);
2265       gcc_assert (stat);
2266       return stat;
2267     }
2268
2269   /* This point is reached if neither peeling nor versioning is being done.  */
2270   gcc_assert (! (do_peeling || do_versioning));
2271
2272   stat = vect_verify_datarefs_alignment (loop_vinfo);
2273   return stat;
2274 }
2275
2276
2277 /* Function vect_find_same_alignment_drs.
2278
2279    Update group and alignment relations according to the chosen
2280    vectorization factor.  */
2281
2282 static void
2283 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2284 {
2285   struct data_reference *dra = DDR_A (ddr);
2286   struct data_reference *drb = DDR_B (ddr);
2287   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2288   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2289
2290   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2291     return;
2292
2293   if (dra == drb)
2294     return;
2295
2296   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2297       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2298       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2299     return;
2300
2301   /* Two references with distance zero have the same alignment.  */
2302   poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2303                           - wi::to_poly_offset (DR_INIT (drb)));
2304   if (maybe_ne (diff, 0))
2305     {
2306       /* Get the wider of the two alignments.  */
2307       unsigned int align_a = (vect_calculate_target_alignment (dra)
2308                               / BITS_PER_UNIT);
2309       unsigned int align_b = (vect_calculate_target_alignment (drb)
2310                               / BITS_PER_UNIT);
2311       unsigned int max_align = MAX (align_a, align_b);
2312
2313       /* Require the gap to be a multiple of the larger vector alignment.  */
2314       if (!multiple_p (diff, max_align))
2315         return;
2316     }
2317
2318   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2319   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2320   if (dump_enabled_p ())
2321     {
2322       dump_printf_loc (MSG_NOTE, vect_location,
2323                        "accesses have the same alignment: ");
2324       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2325       dump_printf (MSG_NOTE,  " and ");
2326       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2327       dump_printf (MSG_NOTE, "\n");
2328     }
2329 }
2330
2331
2332 /* Function vect_analyze_data_refs_alignment
2333
2334    Analyze the alignment of the data-references in the loop.
2335    Return FALSE if a data reference is found that cannot be vectorized.  */
2336
2337 bool
2338 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2339 {
2340   if (dump_enabled_p ())
2341     dump_printf_loc (MSG_NOTE, vect_location,
2342                      "=== vect_analyze_data_refs_alignment ===\n");
2343
2344   /* Mark groups of data references with same alignment using
2345      data dependence information.  */
2346   vec<ddr_p> ddrs = vinfo->ddrs;
2347   struct data_dependence_relation *ddr;
2348   unsigned int i;
2349
2350   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2351     vect_find_same_alignment_drs (ddr);
2352
2353   vec<data_reference_p> datarefs = vinfo->datarefs;
2354   struct data_reference *dr;
2355
2356   vect_record_base_alignments (vinfo);
2357   FOR_EACH_VEC_ELT (datarefs, i, dr)
2358     {
2359       stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2360       if (STMT_VINFO_VECTORIZABLE (stmt_info)
2361           && !vect_compute_data_ref_alignment (dr))
2362         {
2363           /* Strided accesses perform only component accesses, misalignment
2364              information is irrelevant for them.  */
2365           if (STMT_VINFO_STRIDED_P (stmt_info)
2366               && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2367             continue;
2368
2369           if (dump_enabled_p ())
2370             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2371                              "not vectorized: can't calculate alignment "
2372                              "for data ref.\n");
2373
2374           return false;
2375         }
2376     }
2377
2378   return true;
2379 }
2380
2381
2382 /* Analyze alignment of DRs of stmts in NODE.  */
2383
2384 static bool
2385 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2386 {
2387   /* We vectorize from the first scalar stmt in the node unless
2388      the node is permuted in which case we start from the first
2389      element in the group.  */
2390   gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2391   data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2392   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2393     first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2394
2395   data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2396   if (! vect_compute_data_ref_alignment (dr)
2397       /* For creating the data-ref pointer we need alignment of the
2398          first element anyway.  */
2399       || (dr != first_dr
2400           && ! vect_compute_data_ref_alignment (first_dr))
2401       || ! verify_data_ref_alignment (dr))
2402     {
2403       if (dump_enabled_p ())
2404         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2405                          "not vectorized: bad data alignment in basic "
2406                          "block.\n");
2407       return false;
2408     }
2409
2410   return true;
2411 }
2412
2413 /* Function vect_slp_analyze_instance_alignment
2414
2415    Analyze the alignment of the data-references in the SLP instance.
2416    Return FALSE if a data reference is found that cannot be vectorized.  */
2417
2418 bool
2419 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2420 {
2421   if (dump_enabled_p ())
2422     dump_printf_loc (MSG_NOTE, vect_location,
2423                      "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2424
2425   slp_tree node;
2426   unsigned i;
2427   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2428     if (! vect_slp_analyze_and_verify_node_alignment (node))
2429       return false;
2430
2431   node = SLP_INSTANCE_TREE (instance);
2432   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2433       && ! vect_slp_analyze_and_verify_node_alignment
2434              (SLP_INSTANCE_TREE (instance)))
2435     return false;
2436
2437   return true;
2438 }
2439
2440
2441 /* Analyze groups of accesses: check that DR belongs to a group of
2442    accesses of legal size, step, etc.  Detect gaps, single element
2443    interleaving, and other special cases. Set grouped access info.
2444    Collect groups of strided stores for further use in SLP analysis.
2445    Worker for vect_analyze_group_access.  */
2446
2447 static bool
2448 vect_analyze_group_access_1 (struct data_reference *dr)
2449 {
2450   tree step = DR_STEP (dr);
2451   tree scalar_type = TREE_TYPE (DR_REF (dr));
2452   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2453   gimple *stmt = DR_STMT (dr);
2454   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2455   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2456   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2457   HOST_WIDE_INT dr_step = -1;
2458   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2459   bool slp_impossible = false;
2460
2461   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2462      size of the interleaving group (including gaps).  */
2463   if (tree_fits_shwi_p (step))
2464     {
2465       dr_step = tree_to_shwi (step);
2466       /* Check that STEP is a multiple of type size.  Otherwise there is
2467          a non-element-sized gap at the end of the group which we
2468          cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2469          ???  As we can handle non-constant step fine here we should
2470          simply remove uses of DR_GROUP_GAP between the last and first
2471          element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
2472          simply not include that gap.  */
2473       if ((dr_step % type_size) != 0)
2474         {
2475           if (dump_enabled_p ())
2476             {
2477               dump_printf_loc (MSG_NOTE, vect_location,
2478                                "Step ");
2479               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2480               dump_printf (MSG_NOTE,
2481                            " is not a multiple of the element size for ");
2482               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2483               dump_printf (MSG_NOTE, "\n");
2484             }
2485           return false;
2486         }
2487       groupsize = absu_hwi (dr_step) / type_size;
2488     }
2489   else
2490     groupsize = 0;
2491
2492   /* Not consecutive access is possible only if it is a part of interleaving.  */
2493   if (!DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2494     {
2495       /* Check if it this DR is a part of interleaving, and is a single
2496          element of the group that is accessed in the loop.  */
2497
2498       /* Gaps are supported only for loads. STEP must be a multiple of the type
2499          size.  */
2500       if (DR_IS_READ (dr)
2501           && (dr_step % type_size) == 0
2502           && groupsize > 0)
2503         {
2504           DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2505           DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2506           DR_GROUP_GAP (stmt_info) = groupsize - 1;
2507           if (dump_enabled_p ())
2508             {
2509               dump_printf_loc (MSG_NOTE, vect_location,
2510                                "Detected single element interleaving ");
2511               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2512               dump_printf (MSG_NOTE, " step ");
2513               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2514               dump_printf (MSG_NOTE, "\n");
2515             }
2516
2517           return true;
2518         }
2519
2520       if (dump_enabled_p ())
2521         {
2522           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2523                            "not consecutive access ");
2524           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2525         }
2526
2527       if (bb_vinfo)
2528         {
2529           /* Mark the statement as unvectorizable.  */
2530           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2531           return true;
2532         }
2533
2534       dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2535       STMT_VINFO_STRIDED_P (stmt_info) = true;
2536       return true;
2537     }
2538
2539   if (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2540     {
2541       /* First stmt in the interleaving chain. Check the chain.  */
2542       gimple *next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2543       struct data_reference *data_ref = dr;
2544       unsigned int count = 1;
2545       tree prev_init = DR_INIT (data_ref);
2546       gimple *prev = stmt;
2547       HOST_WIDE_INT diff, gaps = 0;
2548
2549       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2550       while (next)
2551         {
2552           /* Skip same data-refs.  In case that two or more stmts share
2553              data-ref (supported only for loads), we vectorize only the first
2554              stmt, and the rest get their vectorized loads from the first
2555              one.  */
2556           if (!tree_int_cst_compare (DR_INIT (data_ref),
2557                                      DR_INIT (STMT_VINFO_DATA_REF (
2558                                                    vinfo_for_stmt (next)))))
2559             {
2560               if (DR_IS_WRITE (data_ref))
2561                 {
2562                   if (dump_enabled_p ())
2563                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2564                                      "Two store stmts share the same dr.\n");
2565                   return false;
2566                 }
2567
2568               if (dump_enabled_p ())
2569                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2570                                  "Two or more load stmts share the same dr.\n");
2571
2572               /* For load use the same data-ref load.  */
2573               DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2574
2575               prev = next;
2576               next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2577               continue;
2578             }
2579
2580           prev = next;
2581           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2582
2583           /* All group members have the same STEP by construction.  */
2584           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2585
2586           /* Check that the distance between two accesses is equal to the type
2587              size. Otherwise, we have gaps.  */
2588           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2589                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2590           if (diff != 1)
2591             {
2592               /* FORNOW: SLP of accesses with gaps is not supported.  */
2593               slp_impossible = true;
2594               if (DR_IS_WRITE (data_ref))
2595                 {
2596                   if (dump_enabled_p ())
2597                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2598                                      "interleaved store with gaps\n");
2599                   return false;
2600                 }
2601
2602               gaps += diff - 1;
2603             }
2604
2605           last_accessed_element += diff;
2606
2607           /* Store the gap from the previous member of the group. If there is no
2608              gap in the access, DR_GROUP_GAP is always 1.  */
2609           DR_GROUP_GAP (vinfo_for_stmt (next)) = diff;
2610
2611           prev_init = DR_INIT (data_ref);
2612           next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2613           /* Count the number of data-refs in the chain.  */
2614           count++;
2615         }
2616
2617       if (groupsize == 0)
2618         groupsize = count + gaps;
2619
2620       /* This could be UINT_MAX but as we are generating code in a very
2621          inefficient way we have to cap earlier.  See PR78699 for example.  */
2622       if (groupsize > 4096)
2623         {
2624           if (dump_enabled_p ())
2625             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2626                              "group is too large\n");
2627           return false;
2628         }
2629
2630       /* Check that the size of the interleaving is equal to count for stores,
2631          i.e., that there are no gaps.  */
2632       if (groupsize != count
2633           && !DR_IS_READ (dr))
2634         {
2635           if (dump_enabled_p ())
2636             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2637                              "interleaved store with gaps\n");
2638           return false;
2639         }
2640
2641       /* If there is a gap after the last load in the group it is the
2642          difference between the groupsize and the last accessed
2643          element.
2644          When there is no gap, this difference should be 0.  */
2645       DR_GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2646
2647       DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2648       if (dump_enabled_p ())
2649         {
2650           dump_printf_loc (MSG_NOTE, vect_location,
2651                            "Detected interleaving ");
2652           if (DR_IS_READ (dr))
2653             dump_printf (MSG_NOTE, "load ");
2654           else
2655             dump_printf (MSG_NOTE, "store ");
2656           dump_printf (MSG_NOTE, "of size %u starting with ",
2657                        (unsigned)groupsize);
2658           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2659           if (DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2660             dump_printf_loc (MSG_NOTE, vect_location,
2661                              "There is a gap of %u elements after the group\n",
2662                              DR_GROUP_GAP (vinfo_for_stmt (stmt)));
2663         }
2664
2665       /* SLP: create an SLP data structure for every interleaving group of
2666          stores for further analysis in vect_analyse_slp.  */
2667       if (DR_IS_WRITE (dr) && !slp_impossible)
2668         {
2669           if (loop_vinfo)
2670             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2671           if (bb_vinfo)
2672             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2673         }
2674     }
2675
2676   return true;
2677 }
2678
2679 /* Analyze groups of accesses: check that DR belongs to a group of
2680    accesses of legal size, step, etc.  Detect gaps, single element
2681    interleaving, and other special cases. Set grouped access info.
2682    Collect groups of strided stores for further use in SLP analysis.  */
2683
2684 static bool
2685 vect_analyze_group_access (struct data_reference *dr)
2686 {
2687   if (!vect_analyze_group_access_1 (dr))
2688     {
2689       /* Dissolve the group if present.  */
2690       gimple *next;
2691       gimple *stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2692       while (stmt)
2693         {
2694           stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2695           next = DR_GROUP_NEXT_ELEMENT (vinfo);
2696           DR_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2697           DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2698           stmt = next;
2699         }
2700       return false;
2701     }
2702   return true;
2703 }
2704
2705 /* Analyze the access pattern of the data-reference DR.
2706    In case of non-consecutive accesses call vect_analyze_group_access() to
2707    analyze groups of accesses.  */
2708
2709 static bool
2710 vect_analyze_data_ref_access (struct data_reference *dr)
2711 {
2712   tree step = DR_STEP (dr);
2713   tree scalar_type = TREE_TYPE (DR_REF (dr));
2714   gimple *stmt = DR_STMT (dr);
2715   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2716   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2717   struct loop *loop = NULL;
2718
2719   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2720     return true;
2721
2722   if (loop_vinfo)
2723     loop = LOOP_VINFO_LOOP (loop_vinfo);
2724
2725   if (loop_vinfo && !step)
2726     {
2727       if (dump_enabled_p ())
2728         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2729                          "bad data-ref access in loop\n");
2730       return false;
2731     }
2732
2733   /* Allow loads with zero step in inner-loop vectorization.  */
2734   if (loop_vinfo && integer_zerop (step))
2735     {
2736       DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2737       if (!nested_in_vect_loop_p (loop, stmt))
2738         return DR_IS_READ (dr);
2739       /* Allow references with zero step for outer loops marked
2740          with pragma omp simd only - it guarantees absence of
2741          loop-carried dependencies between inner loop iterations.  */
2742       if (loop->safelen < 2)
2743         {
2744           if (dump_enabled_p ())
2745             dump_printf_loc (MSG_NOTE, vect_location,
2746                              "zero step in inner loop of nest\n");
2747           return false;
2748         }
2749     }
2750
2751   if (loop && nested_in_vect_loop_p (loop, stmt))
2752     {
2753       /* Interleaved accesses are not yet supported within outer-loop
2754         vectorization for references in the inner-loop.  */
2755       DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2756
2757       /* For the rest of the analysis we use the outer-loop step.  */
2758       step = STMT_VINFO_DR_STEP (stmt_info);
2759       if (integer_zerop (step))
2760         {
2761           if (dump_enabled_p ())
2762             dump_printf_loc (MSG_NOTE, vect_location,
2763                              "zero step in outer loop.\n");
2764           return DR_IS_READ (dr);
2765         }
2766     }
2767
2768   /* Consecutive?  */
2769   if (TREE_CODE (step) == INTEGER_CST)
2770     {
2771       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2772       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2773           || (dr_step < 0
2774               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2775         {
2776           /* Mark that it is not interleaving.  */
2777           DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2778           return true;
2779         }
2780     }
2781
2782   if (loop && nested_in_vect_loop_p (loop, stmt))
2783     {
2784       if (dump_enabled_p ())
2785         dump_printf_loc (MSG_NOTE, vect_location,
2786                          "grouped access in outer loop.\n");
2787       return false;
2788     }
2789
2790
2791   /* Assume this is a DR handled by non-constant strided load case.  */
2792   if (TREE_CODE (step) != INTEGER_CST)
2793     return (STMT_VINFO_STRIDED_P (stmt_info)
2794             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2795                 || vect_analyze_group_access (dr)));
2796
2797   /* Not consecutive access - check if it's a part of interleaving group.  */
2798   return vect_analyze_group_access (dr);
2799 }
2800
2801 /* Compare two data-references DRA and DRB to group them into chunks
2802    suitable for grouping.  */
2803
2804 static int
2805 dr_group_sort_cmp (const void *dra_, const void *drb_)
2806 {
2807   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2808   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2809   int cmp;
2810
2811   /* Stabilize sort.  */
2812   if (dra == drb)
2813     return 0;
2814
2815   /* DRs in different loops never belong to the same group.  */
2816   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2817   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2818   if (loopa != loopb)
2819     return loopa->num < loopb->num ? -1 : 1;
2820
2821   /* Ordering of DRs according to base.  */
2822   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2823                                DR_BASE_ADDRESS (drb));
2824   if (cmp != 0)
2825     return cmp;
2826
2827   /* And according to DR_OFFSET.  */
2828   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2829   if (cmp != 0)
2830     return cmp;
2831
2832   /* Put reads before writes.  */
2833   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2834     return DR_IS_READ (dra) ? -1 : 1;
2835
2836   /* Then sort after access size.  */
2837   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2838                                TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2839   if (cmp != 0)
2840     return cmp;
2841
2842   /* And after step.  */
2843   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2844   if (cmp != 0)
2845     return cmp;
2846
2847   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2848   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2849   if (cmp == 0)
2850     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2851   return cmp;
2852 }
2853
2854 /* If OP is the result of a conversion, return the unconverted value,
2855    otherwise return null.  */
2856
2857 static tree
2858 strip_conversion (tree op)
2859 {
2860   if (TREE_CODE (op) != SSA_NAME)
2861     return NULL_TREE;
2862   gimple *stmt = SSA_NAME_DEF_STMT (op);
2863   if (!is_gimple_assign (stmt)
2864       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2865     return NULL_TREE;
2866   return gimple_assign_rhs1 (stmt);
2867 }
2868
2869 /* Return true if vectorizable_* routines can handle statements STMT1
2870    and STMT2 being in a single group.  */
2871
2872 static bool
2873 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2874 {
2875   if (gimple_assign_single_p (stmt1))
2876     return gimple_assign_single_p (stmt2);
2877
2878   if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2879     {
2880       /* Check for two masked loads or two masked stores.  */
2881       if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2882         return false;
2883       internal_fn ifn = gimple_call_internal_fn (stmt1);
2884       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2885         return false;
2886       if (ifn != gimple_call_internal_fn (stmt2))
2887         return false;
2888
2889       /* Check that the masks are the same.  Cope with casts of masks,
2890          like those created by build_mask_conversion.  */
2891       tree mask1 = gimple_call_arg (stmt1, 2);
2892       tree mask2 = gimple_call_arg (stmt2, 2);
2893       if (!operand_equal_p (mask1, mask2, 0))
2894         {
2895           mask1 = strip_conversion (mask1);
2896           if (!mask1)
2897             return false;
2898           mask2 = strip_conversion (mask2);
2899           if (!mask2)
2900             return false;
2901           if (!operand_equal_p (mask1, mask2, 0))
2902             return false;
2903         }
2904       return true;
2905     }
2906
2907   return false;
2908 }
2909
2910 /* Function vect_analyze_data_ref_accesses.
2911
2912    Analyze the access pattern of all the data references in the loop.
2913
2914    FORNOW: the only access pattern that is considered vectorizable is a
2915            simple step 1 (consecutive) access.
2916
2917    FORNOW: handle only arrays and pointer accesses.  */
2918
2919 bool
2920 vect_analyze_data_ref_accesses (vec_info *vinfo)
2921 {
2922   unsigned int i;
2923   vec<data_reference_p> datarefs = vinfo->datarefs;
2924   struct data_reference *dr;
2925
2926   if (dump_enabled_p ())
2927     dump_printf_loc (MSG_NOTE, vect_location,
2928                      "=== vect_analyze_data_ref_accesses ===\n");
2929
2930   if (datarefs.is_empty ())
2931     return true;
2932
2933   /* Sort the array of datarefs to make building the interleaving chains
2934      linear.  Don't modify the original vector's order, it is needed for
2935      determining what dependencies are reversed.  */
2936   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2937   datarefs_copy.qsort (dr_group_sort_cmp);
2938
2939   /* Build the interleaving chains.  */
2940   for (i = 0; i < datarefs_copy.length () - 1;)
2941     {
2942       data_reference_p dra = datarefs_copy[i];
2943       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2944       stmt_vec_info lastinfo = NULL;
2945       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2946           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2947         {
2948           ++i;
2949           continue;
2950         }
2951       for (i = i + 1; i < datarefs_copy.length (); ++i)
2952         {
2953           data_reference_p drb = datarefs_copy[i];
2954           stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2955           if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2956               || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2957             break;
2958
2959           /* ???  Imperfect sorting (non-compatible types, non-modulo
2960              accesses, same accesses) can lead to a group to be artificially
2961              split here as we don't just skip over those.  If it really
2962              matters we can push those to a worklist and re-iterate
2963              over them.  The we can just skip ahead to the next DR here.  */
2964
2965           /* DRs in a different loop should not be put into the same
2966              interleaving group.  */
2967           if (gimple_bb (DR_STMT (dra))->loop_father
2968               != gimple_bb (DR_STMT (drb))->loop_father)
2969             break;
2970
2971           /* Check that the data-refs have same first location (except init)
2972              and they are both either store or load (not load and store,
2973              not masked loads or stores).  */
2974           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2975               || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2976                                         DR_BASE_ADDRESS (drb)) != 0
2977               || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2978               || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
2979             break;
2980
2981           /* Check that the data-refs have the same constant size.  */
2982           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2983           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2984           if (!tree_fits_uhwi_p (sza)
2985               || !tree_fits_uhwi_p (szb)
2986               || !tree_int_cst_equal (sza, szb))
2987             break;
2988
2989           /* Check that the data-refs have the same step.  */
2990           if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2991             break;
2992
2993           /* Check the types are compatible.
2994              ???  We don't distinguish this during sorting.  */
2995           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2996                                    TREE_TYPE (DR_REF (drb))))
2997             break;
2998
2999           /* Check that the DR_INITs are compile-time constants.  */
3000           if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3001               || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3002             break;
3003
3004           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
3005           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3006           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3007           HOST_WIDE_INT init_prev
3008             = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3009           gcc_assert (init_a <= init_b
3010                       && init_a <= init_prev
3011                       && init_prev <= init_b);
3012
3013           /* Do not place the same access in the interleaving chain twice.  */
3014           if (init_b == init_prev)
3015             {
3016               gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3017                           < gimple_uid (DR_STMT (drb)));
3018               /* ???  For now we simply "drop" the later reference which is
3019                  otherwise the same rather than finishing off this group.
3020                  In the end we'd want to re-process duplicates forming
3021                  multiple groups from the refs, likely by just collecting
3022                  all candidates (including duplicates and split points
3023                  below) in a vector and then process them together.  */
3024               continue;
3025             }
3026
3027           /* If init_b == init_a + the size of the type * k, we have an
3028              interleaving, and DRA is accessed before DRB.  */
3029           HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3030           if (type_size_a == 0
3031               || (init_b - init_a) % type_size_a != 0)
3032             break;
3033
3034           /* If we have a store, the accesses are adjacent.  This splits
3035              groups into chunks we support (we don't support vectorization
3036              of stores with gaps).  */
3037           if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3038             break;
3039
3040           /* If the step (if not zero or non-constant) is greater than the
3041              difference between data-refs' inits this splits groups into
3042              suitable sizes.  */
3043           if (tree_fits_shwi_p (DR_STEP (dra)))
3044             {
3045               HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3046               if (step != 0 && step <= (init_b - init_a))
3047                 break;
3048             }
3049
3050           if (dump_enabled_p ())
3051             {
3052               dump_printf_loc (MSG_NOTE, vect_location,
3053                                "Detected interleaving ");
3054               if (DR_IS_READ (dra))
3055                 dump_printf (MSG_NOTE, "load ");
3056               else
3057                 dump_printf (MSG_NOTE, "store ");
3058               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3059               dump_printf (MSG_NOTE,  " and ");
3060               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3061               dump_printf (MSG_NOTE, "\n");
3062             }
3063
3064           /* Link the found element into the group list.  */
3065           if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3066             {
3067               DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
3068               lastinfo = stmtinfo_a;
3069             }
3070           DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
3071           DR_GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
3072           lastinfo = stmtinfo_b;
3073         }
3074     }
3075
3076   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3077     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
3078         && !vect_analyze_data_ref_access (dr))
3079       {
3080         if (dump_enabled_p ())
3081           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3082                            "not vectorized: complicated access pattern.\n");
3083
3084         if (is_a <bb_vec_info> (vinfo))
3085           {
3086             /* Mark the statement as not vectorizable.  */
3087             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3088             continue;
3089           }
3090         else
3091           {
3092             datarefs_copy.release ();
3093             return false;
3094           }
3095       }
3096
3097   datarefs_copy.release ();
3098   return true;
3099 }
3100
3101 /* Function vect_vfa_segment_size.
3102
3103    Input:
3104      DR: The data reference.
3105      LENGTH_FACTOR: segment length to consider.
3106
3107    Return a value suitable for the dr_with_seg_len::seg_len field.
3108    This is the "distance travelled" by the pointer from the first
3109    iteration in the segment to the last.  Note that it does not include
3110    the size of the access; in effect it only describes the first byte.  */
3111
3112 static tree
3113 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3114 {
3115   length_factor = size_binop (MINUS_EXPR,
3116                               fold_convert (sizetype, length_factor),
3117                               size_one_node);
3118   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3119                      length_factor);
3120 }
3121
3122 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3123    gives the worst-case number of bytes covered by the segment.  */
3124
3125 static unsigned HOST_WIDE_INT
3126 vect_vfa_access_size (data_reference *dr)
3127 {
3128   stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr));
3129   tree ref_type = TREE_TYPE (DR_REF (dr));
3130   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3131   unsigned HOST_WIDE_INT access_size = ref_size;
3132   if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3133     {
3134       gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr));
3135       access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3136     }
3137   if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3138       && (vect_supportable_dr_alignment (dr, false)
3139           == dr_explicit_realign_optimized))
3140     {
3141       /* We might access a full vector's worth.  */
3142       tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3143       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3144     }
3145   return access_size;
3146 }
3147
3148 /* Get the minimum alignment for all the scalar accesses that DR describes.  */
3149
3150 static unsigned int
3151 vect_vfa_align (const data_reference *dr)
3152 {
3153   return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3154 }
3155
3156 /* Function vect_no_alias_p.
3157
3158    Given data references A and B with equal base and offset, see whether
3159    the alias relation can be decided at compilation time.  Return 1 if
3160    it can and the references alias, 0 if it can and the references do
3161    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3162    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3163    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
3164
3165 static int
3166 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3167                          tree segment_length_a, tree segment_length_b,
3168                          unsigned HOST_WIDE_INT access_size_a,
3169                          unsigned HOST_WIDE_INT access_size_b)
3170 {
3171   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3172   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3173   poly_uint64 const_length_a;
3174   poly_uint64 const_length_b;
3175
3176   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3177      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3178      [a, a+12) */
3179   if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3180     {
3181       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3182       offset_a = (offset_a + access_size_a) - const_length_a;
3183     }
3184   else
3185     const_length_a = tree_to_poly_uint64 (segment_length_a);
3186   if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3187     {
3188       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3189       offset_b = (offset_b + access_size_b) - const_length_b;
3190     }
3191   else
3192     const_length_b = tree_to_poly_uint64 (segment_length_b);
3193
3194   const_length_a += access_size_a;
3195   const_length_b += access_size_b;
3196
3197   if (ranges_known_overlap_p (offset_a, const_length_a,
3198                               offset_b, const_length_b))
3199     return 1;
3200
3201   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3202                                offset_b, const_length_b))
3203     return 0;
3204
3205   return -1;
3206 }
3207
3208 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3209    in DDR is >= VF.  */
3210
3211 static bool
3212 dependence_distance_ge_vf (data_dependence_relation *ddr,
3213                            unsigned int loop_depth, poly_uint64 vf)
3214 {
3215   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3216       || DDR_NUM_DIST_VECTS (ddr) == 0)
3217     return false;
3218
3219   /* If the dependence is exact, we should have limited the VF instead.  */
3220   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3221
3222   unsigned int i;
3223   lambda_vector dist_v;
3224   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3225     {
3226       HOST_WIDE_INT dist = dist_v[loop_depth];
3227       if (dist != 0
3228           && !(dist > 0 && DDR_REVERSED_P (ddr))
3229           && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3230         return false;
3231     }
3232
3233   if (dump_enabled_p ())
3234     {
3235       dump_printf_loc (MSG_NOTE, vect_location,
3236                        "dependence distance between ");
3237       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3238       dump_printf (MSG_NOTE,  " and ");
3239       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3240       dump_printf (MSG_NOTE,  " is >= VF\n");
3241     }
3242
3243   return true;
3244 }
3245
3246 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3247
3248 static void
3249 dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound)
3250 {
3251   dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3252   dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3253   dump_printf (dump_kind, ") >= ");
3254   dump_dec (dump_kind, lower_bound.min_value);
3255 }
3256
3257 /* Record that the vectorized loop requires the vec_lower_bound described
3258    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3259
3260 static void
3261 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3262                         poly_uint64 min_value)
3263 {
3264   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3265   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3266     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3267       {
3268         unsigned_p &= lower_bounds[i].unsigned_p;
3269         min_value = upper_bound (lower_bounds[i].min_value, min_value);
3270         if (lower_bounds[i].unsigned_p != unsigned_p
3271             || maybe_lt (lower_bounds[i].min_value, min_value))
3272           {
3273             lower_bounds[i].unsigned_p = unsigned_p;
3274             lower_bounds[i].min_value = min_value;
3275             if (dump_enabled_p ())
3276               {
3277                 dump_printf_loc (MSG_NOTE, vect_location,
3278                                  "updating run-time check to ");
3279                 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3280                 dump_printf (MSG_NOTE, "\n");
3281               }
3282           }
3283         return;
3284       }
3285
3286   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3287   if (dump_enabled_p ())
3288     {
3289       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3290       dump_lower_bound (MSG_NOTE, lower_bound);
3291       dump_printf (MSG_NOTE, "\n");
3292     }
3293   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3294 }
3295
3296 /* Return true if it's unlikely that the step of the vectorized form of DR
3297    will span fewer than GAP bytes.  */
3298
3299 static bool
3300 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3301 {
3302   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
3303   HOST_WIDE_INT count
3304     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3305   if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3306     count *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
3307   return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3308 }
3309
3310 /* Return true if we know that there is no alias between DR_A and DR_B
3311    when abs (DR_STEP (DR_A)) >= N for some N.  When returning true, set
3312    *LOWER_BOUND_OUT to this N.  */
3313
3314 static bool
3315 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3316                                 poly_uint64 *lower_bound_out)
3317 {
3318   /* Check that there is a constant gap of known sign between DR_A
3319      and DR_B.  */
3320   poly_int64 init_a, init_b;
3321   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3322       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3323       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3324       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3325       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3326       || !ordered_p (init_a, init_b))
3327     return false;
3328
3329   /* Sort DR_A and DR_B by the address they access.  */
3330   if (maybe_lt (init_b, init_a))
3331     {
3332       std::swap (init_a, init_b);
3333       std::swap (dr_a, dr_b);
3334     }
3335
3336   /* If the two accesses could be dependent within a scalar iteration,
3337      make sure that we'd retain their order.  */
3338   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3339       && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b)))
3340     return false;
3341
3342   /* There is no alias if abs (DR_STEP) is greater than or equal to
3343      the bytes spanned by the combination of the two accesses.  */
3344   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3345   return true;
3346 }
3347
3348 /* Function vect_prune_runtime_alias_test_list.
3349
3350    Prune a list of ddrs to be tested at run-time by versioning for alias.
3351    Merge several alias checks into one if possible.
3352    Return FALSE if resulting list of ddrs is longer then allowed by
3353    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3354
3355 bool
3356 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3357 {
3358   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3359   hash_set <tree_pair_hash> compared_objects;
3360
3361   vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3362   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3363     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3364   vec<vec_object_pair> &check_unequal_addrs
3365     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3366   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3367   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3368
3369   ddr_p ddr;
3370   unsigned int i;
3371   tree length_factor;
3372
3373   if (dump_enabled_p ())
3374     dump_printf_loc (MSG_NOTE, vect_location,
3375                      "=== vect_prune_runtime_alias_test_list ===\n");
3376
3377   /* Step values are irrelevant for aliasing if the number of vector
3378      iterations is equal to the number of scalar iterations (which can
3379      happen for fully-SLP loops).  */
3380   bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3381
3382   if (!ignore_step_p)
3383     {
3384       /* Convert the checks for nonzero steps into bound tests.  */
3385       tree value;
3386       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3387         vect_check_lower_bound (loop_vinfo, value, true, 1);
3388     }
3389
3390   if (may_alias_ddrs.is_empty ())
3391     return true;
3392
3393   comp_alias_ddrs.create (may_alias_ddrs.length ());
3394
3395   unsigned int loop_depth
3396     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3397                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
3398
3399   /* First, we collect all data ref pairs for aliasing checks.  */
3400   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3401     {
3402       int comp_res;
3403       poly_uint64 lower_bound;
3404       struct data_reference *dr_a, *dr_b;
3405       gimple *dr_group_first_a, *dr_group_first_b;
3406       tree segment_length_a, segment_length_b;
3407       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3408       unsigned int align_a, align_b;
3409       gimple *stmt_a, *stmt_b;
3410
3411       /* Ignore the alias if the VF we chose ended up being no greater
3412          than the dependence distance.  */
3413       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3414         continue;
3415
3416       if (DDR_OBJECT_A (ddr))
3417         {
3418           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3419           if (!compared_objects.add (new_pair))
3420             {
3421               if (dump_enabled_p ())
3422                 {
3423                   dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3424                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3425                   dump_printf (MSG_NOTE, " and ");
3426                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3427                   dump_printf (MSG_NOTE, " have different addresses\n");
3428                 }
3429               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3430             }
3431           continue;
3432         }
3433
3434       dr_a = DDR_A (ddr);
3435       stmt_a = DR_STMT (DDR_A (ddr));
3436
3437       dr_b = DDR_B (ddr);
3438       stmt_b = DR_STMT (DDR_B (ddr));
3439
3440       /* Skip the pair if inter-iteration dependencies are irrelevant
3441          and intra-iteration dependencies are guaranteed to be honored.  */
3442       if (ignore_step_p
3443           && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3444               || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3445         {
3446           if (dump_enabled_p ())
3447             {
3448               dump_printf_loc (MSG_NOTE, vect_location,
3449                                "no need for alias check between ");
3450               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3451               dump_printf (MSG_NOTE, " and ");
3452               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3453               dump_printf (MSG_NOTE, " when VF is 1\n");
3454             }
3455           continue;
3456         }
3457
3458       /* See whether we can handle the alias using a bounds check on
3459          the step, and whether that's likely to be the best approach.
3460          (It might not be, for example, if the minimum step is much larger
3461          than the number of bytes handled by one vector iteration.)  */
3462       if (!ignore_step_p
3463           && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3464           && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3465           && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3466               || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3467         {
3468           bool unsigned_p = dr_known_forward_stride_p (dr_a);
3469           if (dump_enabled_p ())
3470             {
3471               dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3472               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3473               dump_printf (MSG_NOTE, " and ");
3474               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3475               dump_printf (MSG_NOTE, " when the step ");
3476               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3477               dump_printf (MSG_NOTE, " is outside ");
3478               if (unsigned_p)
3479                 dump_printf (MSG_NOTE, "[0");
3480               else
3481                 {
3482                   dump_printf (MSG_NOTE, "(");
3483                   dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3484                 }
3485               dump_printf (MSG_NOTE, ", ");
3486               dump_dec (MSG_NOTE, lower_bound);
3487               dump_printf (MSG_NOTE, ")\n");
3488             }
3489           vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3490                                   lower_bound);
3491           continue;
3492         }
3493
3494       dr_group_first_a = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3495       if (dr_group_first_a)
3496         {
3497           stmt_a = dr_group_first_a;
3498           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3499         }
3500
3501       dr_group_first_b = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3502       if (dr_group_first_b)
3503         {
3504           stmt_b = dr_group_first_b;
3505           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3506         }
3507
3508       if (ignore_step_p)
3509         {
3510           segment_length_a = size_zero_node;
3511           segment_length_b = size_zero_node;
3512         }
3513       else
3514         {
3515           if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3516             length_factor = scalar_loop_iters;
3517           else
3518             length_factor = size_int (vect_factor);
3519           segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3520           segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3521         }
3522       access_size_a = vect_vfa_access_size (dr_a);
3523       access_size_b = vect_vfa_access_size (dr_b);
3524       align_a = vect_vfa_align (dr_a);
3525       align_b = vect_vfa_align (dr_b);
3526
3527       comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3528                                         DR_BASE_ADDRESS (dr_b));
3529       if (comp_res == 0)
3530         comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3531                                           DR_OFFSET (dr_b));
3532
3533       /* See whether the alias is known at compilation time.  */
3534       if (comp_res == 0
3535           && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3536           && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3537           && poly_int_tree_p (segment_length_a)
3538           && poly_int_tree_p (segment_length_b))
3539         {
3540           int res = vect_compile_time_alias (dr_a, dr_b,
3541                                              segment_length_a,
3542                                              segment_length_b,
3543                                              access_size_a,
3544                                              access_size_b);
3545           if (res >= 0 && dump_enabled_p ())
3546             {
3547               dump_printf_loc (MSG_NOTE, vect_location,
3548                                "can tell at compile time that ");
3549               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3550               dump_printf (MSG_NOTE, " and ");
3551               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3552               if (res == 0)
3553                 dump_printf (MSG_NOTE, " do not alias\n");
3554               else
3555                 dump_printf (MSG_NOTE, " alias\n");
3556             }
3557
3558           if (res == 0)
3559             continue;
3560
3561           if (res == 1)
3562             {
3563               if (dump_enabled_p ())
3564                 dump_printf_loc (MSG_NOTE, vect_location,
3565                                  "not vectorized: compilation time alias.\n");
3566               return false;
3567             }
3568         }
3569
3570       dr_with_seg_len_pair_t dr_with_seg_len_pair
3571         (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3572          dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3573
3574       /* Canonicalize pairs by sorting the two DR members.  */
3575       if (comp_res > 0)
3576         std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3577
3578       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3579     }
3580
3581   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3582
3583   unsigned int count = (comp_alias_ddrs.length ()
3584                         + check_unequal_addrs.length ());
3585
3586   dump_printf_loc (MSG_NOTE, vect_location,
3587                    "improved number of alias checks from %d to %d\n",
3588                    may_alias_ddrs.length (), count);
3589   if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3590     {
3591       if (dump_enabled_p ())
3592         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3593                          "number of versioning for alias "
3594                          "run-time tests exceeds %d "
3595                          "(--param vect-max-version-for-alias-checks)\n",
3596                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3597       return false;
3598     }
3599
3600   return true;
3601 }
3602
3603 /* Check whether we can use an internal function for a gather load
3604    or scatter store.  READ_P is true for loads and false for stores.
3605    MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
3606    the type of the memory elements being loaded or stored.  OFFSET_BITS
3607    is the number of bits in each scalar offset and OFFSET_SIGN is the
3608    sign of the offset.  SCALE is the amount by which the offset should
3609    be multiplied *after* it has been converted to address width.
3610
3611    Return true if the function is supported, storing the function
3612    id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT.  */
3613
3614 bool
3615 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3616                           tree memory_type, unsigned int offset_bits,
3617                           signop offset_sign, int scale,
3618                           internal_fn *ifn_out, tree *element_type_out)
3619 {
3620   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3621   unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3622   if (offset_bits > element_bits)
3623     /* Internal functions require the offset to be the same width as
3624        the vector elements.  We can extend narrower offsets, but it isn't
3625        safe to truncate wider offsets.  */
3626     return false;
3627
3628   if (element_bits != memory_bits)
3629     /* For now the vector elements must be the same width as the
3630        memory elements.  */
3631     return false;
3632
3633   /* Work out which function we need.  */
3634   internal_fn ifn;
3635   if (read_p)
3636     ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3637   else
3638     ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3639
3640   /* Test whether the target supports this combination.  */
3641   if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3642                                                offset_sign, scale))
3643     return false;
3644
3645   *ifn_out = ifn;
3646   *element_type_out = TREE_TYPE (vectype);
3647   return true;
3648 }
3649
3650 /* CALL is a call to an internal gather load or scatter store function.
3651    Describe the operation in INFO.  */
3652
3653 static void
3654 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3655 {
3656   stmt_vec_info stmt_info = vinfo_for_stmt (call);
3657   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3658   data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3659
3660   info->ifn = gimple_call_internal_fn (call);
3661   info->decl = NULL_TREE;
3662   info->base = gimple_call_arg (call, 0);
3663   info->offset = gimple_call_arg (call, 1);
3664   info->offset_dt = vect_unknown_def_type;
3665   info->offset_vectype = NULL_TREE;
3666   info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3667   info->element_type = TREE_TYPE (vectype);
3668   info->memory_type = TREE_TYPE (DR_REF (dr));
3669 }
3670
3671 /* Return true if a non-affine read or write in STMT is suitable for a
3672    gather load or scatter store.  Describe the operation in *INFO if so.  */
3673
3674 bool
3675 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3676                            gather_scatter_info *info)
3677 {
3678   HOST_WIDE_INT scale = 1;
3679   poly_int64 pbitpos, pbitsize;
3680   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3681   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3682   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3683   tree offtype = NULL_TREE;
3684   tree decl = NULL_TREE, base, off;
3685   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3686   tree memory_type = TREE_TYPE (DR_REF (dr));
3687   machine_mode pmode;
3688   int punsignedp, reversep, pvolatilep = 0;
3689   internal_fn ifn;
3690   tree element_type;
3691   bool masked_p = false;
3692
3693   /* See whether this is already a call to a gather/scatter internal function.
3694      If not, see whether it's a masked load or store.  */
3695   gcall *call = dyn_cast <gcall *> (stmt);
3696   if (call && gimple_call_internal_p (call))
3697     {
3698       ifn = gimple_call_internal_fn (stmt);
3699       if (internal_gather_scatter_fn_p (ifn))
3700         {
3701           vect_describe_gather_scatter_call (call, info);
3702           return true;
3703         }
3704       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3705     }
3706
3707   /* True if we should aim to use internal functions rather than
3708      built-in functions.  */
3709   bool use_ifn_p = (DR_IS_READ (dr)
3710                     ? supports_vec_gather_load_p ()
3711                     : supports_vec_scatter_store_p ());
3712
3713   base = DR_REF (dr);
3714   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3715      see if we can use the def stmt of the address.  */
3716   if (masked_p
3717       && TREE_CODE (base) == MEM_REF
3718       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3719       && integer_zerop (TREE_OPERAND (base, 1))
3720       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3721     {
3722       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3723       if (is_gimple_assign (def_stmt)
3724           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3725         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3726     }
3727
3728   /* The gather and scatter builtins need address of the form
3729      loop_invariant + vector * {1, 2, 4, 8}
3730      or
3731      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3732      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3733      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3734      multiplications and additions in it.  To get a vector, we need
3735      a single SSA_NAME that will be defined in the loop and will
3736      contain everything that is not loop invariant and that can be
3737      vectorized.  The following code attempts to find such a preexistng
3738      SSA_NAME OFF and put the loop invariants into a tree BASE
3739      that can be gimplified before the loop.  */
3740   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3741                               &punsignedp, &reversep, &pvolatilep);
3742   gcc_assert (base && !reversep);
3743   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3744
3745   if (TREE_CODE (base) == MEM_REF)
3746     {
3747       if (!integer_zerop (TREE_OPERAND (base, 1)))
3748         {
3749           if (off == NULL_TREE)
3750             off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3751           else
3752             off = size_binop (PLUS_EXPR, off,
3753                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3754         }
3755       base = TREE_OPERAND (base, 0);
3756     }
3757   else
3758     base = build_fold_addr_expr (base);
3759
3760   if (off == NULL_TREE)
3761     off = size_zero_node;
3762
3763   /* If base is not loop invariant, either off is 0, then we start with just
3764      the constant offset in the loop invariant BASE and continue with base
3765      as OFF, otherwise give up.
3766      We could handle that case by gimplifying the addition of base + off
3767      into some SSA_NAME and use that as off, but for now punt.  */
3768   if (!expr_invariant_in_loop_p (loop, base))
3769     {
3770       if (!integer_zerop (off))
3771         return false;
3772       off = base;
3773       base = size_int (pbytepos);
3774     }
3775   /* Otherwise put base + constant offset into the loop invariant BASE
3776      and continue with OFF.  */
3777   else
3778     {
3779       base = fold_convert (sizetype, base);
3780       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3781     }
3782
3783   /* OFF at this point may be either a SSA_NAME or some tree expression
3784      from get_inner_reference.  Try to peel off loop invariants from it
3785      into BASE as long as possible.  */
3786   STRIP_NOPS (off);
3787   while (offtype == NULL_TREE)
3788     {
3789       enum tree_code code;
3790       tree op0, op1, add = NULL_TREE;
3791
3792       if (TREE_CODE (off) == SSA_NAME)
3793         {
3794           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3795
3796           if (expr_invariant_in_loop_p (loop, off))
3797             return false;
3798
3799           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3800             break;
3801
3802           op0 = gimple_assign_rhs1 (def_stmt);
3803           code = gimple_assign_rhs_code (def_stmt);
3804           op1 = gimple_assign_rhs2 (def_stmt);
3805         }
3806       else
3807         {
3808           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3809             return false;
3810           code = TREE_CODE (off);
3811           extract_ops_from_tree (off, &code, &op0, &op1);
3812         }
3813       switch (code)
3814         {
3815         case POINTER_PLUS_EXPR:
3816         case PLUS_EXPR:
3817           if (expr_invariant_in_loop_p (loop, op0))
3818             {
3819               add = op0;
3820               off = op1;
3821             do_add:
3822               add = fold_convert (sizetype, add);
3823               if (scale != 1)
3824                 add = size_binop (MULT_EXPR, add, size_int (scale));
3825               base = size_binop (PLUS_EXPR, base, add);
3826               continue;
3827             }
3828           if (expr_invariant_in_loop_p (loop, op1))
3829             {
3830               add = op1;
3831               off = op0;
3832               goto do_add;
3833             }
3834           break;
3835         case MINUS_EXPR:
3836           if (expr_invariant_in_loop_p (loop, op1))
3837             {
3838               add = fold_convert (sizetype, op1);
3839               add = size_binop (MINUS_EXPR, size_zero_node, add);
3840               off = op0;
3841               goto do_add;
3842             }
3843           break;
3844         case MULT_EXPR:
3845           if (scale == 1 && tree_fits_shwi_p (op1))
3846             {
3847               int new_scale = tree_to_shwi (op1);
3848               /* Only treat this as a scaling operation if the target
3849                  supports it.  */
3850               if (use_ifn_p
3851                   && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3852                                                 vectype, memory_type, 1,
3853                                                 TYPE_SIGN (TREE_TYPE (op0)),
3854                                                 new_scale, &ifn,
3855                                                 &element_type))
3856                 break;
3857               scale = new_scale;
3858               off = op0;
3859               continue;
3860             }
3861           break;
3862         case SSA_NAME:
3863           off = op0;
3864           continue;
3865         CASE_CONVERT:
3866           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3867               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3868             break;
3869           if (TYPE_PRECISION (TREE_TYPE (op0))
3870               == TYPE_PRECISION (TREE_TYPE (off)))
3871             {
3872               off = op0;
3873               continue;
3874             }
3875
3876           /* The internal functions need the offset to be the same width
3877              as the elements of VECTYPE.  Don't include operations that
3878              cast the offset from that width to a different width.  */
3879           if (use_ifn_p
3880               && (int_size_in_bytes (TREE_TYPE (vectype))
3881                   == int_size_in_bytes (TREE_TYPE (off))))
3882             break;
3883
3884           if (TYPE_PRECISION (TREE_TYPE (op0))
3885               < TYPE_PRECISION (TREE_TYPE (off)))
3886             {
3887               off = op0;
3888               offtype = TREE_TYPE (off);
3889               STRIP_NOPS (off);
3890               continue;
3891             }
3892           break;
3893         default:
3894           break;
3895         }
3896       break;
3897     }
3898
3899   /* If at the end OFF still isn't a SSA_NAME or isn't
3900      defined in the loop, punt.  */
3901   if (TREE_CODE (off) != SSA_NAME
3902       || expr_invariant_in_loop_p (loop, off))
3903     return false;
3904
3905   if (offtype == NULL_TREE)
3906     offtype = TREE_TYPE (off);
3907
3908   if (use_ifn_p)
3909     {
3910       if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3911                                      memory_type, TYPE_PRECISION (offtype),
3912                                      TYPE_SIGN (offtype), scale, &ifn,
3913                                      &element_type))
3914         return false;
3915     }
3916   else
3917     {
3918       if (DR_IS_READ (dr))
3919         {
3920           if (targetm.vectorize.builtin_gather)
3921             decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3922         }
3923       else
3924         {
3925           if (targetm.vectorize.builtin_scatter)
3926             decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3927         }
3928
3929       if (!decl)
3930         return false;
3931
3932       ifn = IFN_LAST;
3933       element_type = TREE_TYPE (vectype);
3934     }
3935
3936   info->ifn = ifn;
3937   info->decl = decl;
3938   info->base = base;
3939   info->offset = off;
3940   info->offset_dt = vect_unknown_def_type;
3941   info->offset_vectype = NULL_TREE;
3942   info->scale = scale;
3943   info->element_type = element_type;
3944   info->memory_type = memory_type;
3945   return true;
3946 }
3947
3948 /* Find the data references in STMT, analyze them with respect to LOOP and
3949    append them to DATAREFS.  Return false if datarefs in this stmt cannot
3950    be handled.  */
3951
3952 bool
3953 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3954                                vec<data_reference_p> *datarefs)
3955 {
3956   /* We can ignore clobbers for dataref analysis - they are removed during
3957      loop vectorization and BB vectorization checks dependences with a
3958      stmt walk.  */
3959   if (gimple_clobber_p (stmt))
3960     return true;
3961
3962   if (gimple_has_volatile_ops (stmt))
3963     {
3964       if (dump_enabled_p ())
3965         {
3966           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3967                            "not vectorized: volatile type ");
3968           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3969         }
3970       return false;
3971     }
3972
3973   if (stmt_can_throw_internal (stmt))
3974     {
3975       if (dump_enabled_p ())
3976         {
3977           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3978                            "not vectorized: statement can throw an "
3979                            "exception ");
3980           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3981         }
3982       return false;
3983     }
3984
3985   auto_vec<data_reference_p, 2> refs;
3986   if (!find_data_references_in_stmt (loop, stmt, &refs))
3987     return false;
3988
3989   if (refs.is_empty ())
3990     return true;
3991
3992   if (refs.length () > 1)
3993     {
3994       if (dump_enabled_p ())
3995         {
3996           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3997                            "not vectorized: more than one data ref "
3998                            "in stmt: ");
3999           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4000         }
4001       return false;
4002     }
4003
4004   if (gcall *call = dyn_cast <gcall *> (stmt))
4005     if (!gimple_call_internal_p (call)
4006         || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4007             && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4008       {
4009         if (dump_enabled_p ())
4010           {
4011             dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
4012                              "not vectorized: dr in a call ");
4013             dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4014           }
4015         return false;
4016       }
4017
4018   data_reference_p dr = refs.pop ();
4019   if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4020       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4021     {
4022       if (dump_enabled_p ())
4023         {
4024           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4025                            "not vectorized: statement is bitfield "
4026                            "access ");
4027           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4028         }
4029       return false;
4030     }
4031
4032   if (DR_BASE_ADDRESS (dr)
4033       && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4034     {
4035       if (dump_enabled_p ())
4036         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4037                          "not vectorized: base addr of dr is a "
4038                          "constant\n");
4039       return false;
4040     }
4041
4042   datarefs->safe_push (dr);
4043   return true;
4044 }
4045
4046 /* Function vect_analyze_data_refs.
4047
4048   Find all the data references in the loop or basic block.
4049
4050    The general structure of the analysis of data refs in the vectorizer is as
4051    follows:
4052    1- vect_analyze_data_refs(loop/bb): call
4053       compute_data_dependences_for_loop/bb to find and analyze all data-refs
4054       in the loop/bb and their dependences.
4055    2- vect_analyze_dependences(): apply dependence testing using ddrs.
4056    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4057    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4058
4059 */
4060
4061 bool
4062 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4063 {
4064   struct loop *loop = NULL;
4065   unsigned int i;
4066   struct data_reference *dr;
4067   tree scalar_type;
4068
4069   if (dump_enabled_p ())
4070     dump_printf_loc (MSG_NOTE, vect_location,
4071                      "=== vect_analyze_data_refs ===\n");
4072
4073   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4074     loop = LOOP_VINFO_LOOP (loop_vinfo);
4075
4076   /* Go through the data-refs, check that the analysis succeeded.  Update
4077      pointer from stmt_vec_info struct to DR and vectype.  */
4078
4079   vec<data_reference_p> datarefs = vinfo->datarefs;
4080   FOR_EACH_VEC_ELT (datarefs, i, dr)
4081     {
4082       gimple *stmt;
4083       stmt_vec_info stmt_info;
4084       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4085       bool simd_lane_access = false;
4086       poly_uint64 vf;
4087
4088       gcc_assert (DR_REF (dr));
4089       stmt = DR_STMT (dr);
4090       stmt_info = vinfo_for_stmt (stmt);
4091
4092       /* Check that analysis of the data-ref succeeded.  */
4093       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4094           || !DR_STEP (dr))
4095         {
4096           bool maybe_gather
4097             = DR_IS_READ (dr)
4098               && !TREE_THIS_VOLATILE (DR_REF (dr))
4099               && (targetm.vectorize.builtin_gather != NULL
4100                   || supports_vec_gather_load_p ());
4101           bool maybe_scatter
4102             = DR_IS_WRITE (dr)
4103               && !TREE_THIS_VOLATILE (DR_REF (dr))
4104               && (targetm.vectorize.builtin_scatter != NULL
4105                   || supports_vec_scatter_store_p ());
4106           bool maybe_simd_lane_access
4107             = is_a <loop_vec_info> (vinfo) && loop->simduid;
4108
4109           /* If target supports vector gather loads or scatter stores, or if
4110              this might be a SIMD lane access, see if they can't be used.  */
4111           if (is_a <loop_vec_info> (vinfo)
4112               && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
4113               && !nested_in_vect_loop_p (loop, stmt))
4114             {
4115               struct data_reference *newdr
4116                 = create_data_ref (NULL, loop_containing_stmt (stmt),
4117                                    DR_REF (dr), stmt, !maybe_scatter,
4118                                    DR_IS_CONDITIONAL_IN_STMT (dr));
4119               gcc_assert (newdr != NULL && DR_REF (newdr));
4120               if (DR_BASE_ADDRESS (newdr)
4121                   && DR_OFFSET (newdr)
4122                   && DR_INIT (newdr)
4123                   && DR_STEP (newdr)
4124                   && integer_zerop (DR_STEP (newdr)))
4125                 {
4126                   if (maybe_simd_lane_access)
4127                     {
4128                       tree off = DR_OFFSET (newdr);
4129                       STRIP_NOPS (off);
4130                       if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4131                           && TREE_CODE (off) == MULT_EXPR
4132                           && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4133                         {
4134                           tree step = TREE_OPERAND (off, 1);
4135                           off = TREE_OPERAND (off, 0);
4136                           STRIP_NOPS (off);
4137                           if (CONVERT_EXPR_P (off)
4138                               && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4139                                                                           0)))
4140                                  < TYPE_PRECISION (TREE_TYPE (off)))
4141                             off = TREE_OPERAND (off, 0);
4142                           if (TREE_CODE (off) == SSA_NAME)
4143                             {
4144                               gimple *def = SSA_NAME_DEF_STMT (off);
4145                               tree reft = TREE_TYPE (DR_REF (newdr));
4146                               if (is_gimple_call (def)
4147                                   && gimple_call_internal_p (def)
4148                                   && (gimple_call_internal_fn (def)
4149                                       == IFN_GOMP_SIMD_LANE))
4150                                 {
4151                                   tree arg = gimple_call_arg (def, 0);
4152                                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
4153                                   arg = SSA_NAME_VAR (arg);
4154                                   if (arg == loop->simduid
4155                                       /* For now.  */
4156                                       && tree_int_cst_equal
4157                                            (TYPE_SIZE_UNIT (reft),
4158                                             step))
4159                                     {
4160                                       DR_OFFSET (newdr) = ssize_int (0);
4161                                       DR_STEP (newdr) = step;
4162                                       DR_OFFSET_ALIGNMENT (newdr)
4163                                         = BIGGEST_ALIGNMENT;
4164                                       DR_STEP_ALIGNMENT (newdr)
4165                                         = highest_pow2_factor (step);
4166                                       dr = newdr;
4167                                       simd_lane_access = true;
4168                                     }
4169                                 }
4170                             }
4171                         }
4172                     }
4173                   if (!simd_lane_access && (maybe_gather || maybe_scatter))
4174                     {
4175                       dr = newdr;
4176                       if (maybe_gather)
4177                         gatherscatter = GATHER;
4178                       else
4179                         gatherscatter = SCATTER;
4180                     }
4181                 }
4182               if (gatherscatter == SG_NONE && !simd_lane_access)
4183                 free_data_ref (newdr);
4184             }
4185
4186           if (gatherscatter == SG_NONE && !simd_lane_access)
4187             {
4188               if (dump_enabled_p ())
4189                 {
4190                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4191                                    "not vectorized: data ref analysis "
4192                                    "failed ");
4193                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4194                 }
4195               if (is_a <bb_vec_info> (vinfo))
4196                 {
4197                   /* In BB vectorization the ref can still participate
4198                      in dependence analysis, we just can't vectorize it.  */
4199                   STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4200                   continue;
4201                 }
4202               return false;
4203             }
4204         }
4205
4206       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4207           && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4208           && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4209         {
4210           if (dump_enabled_p ())
4211             {
4212               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4213                                "not vectorized: base object not addressable "
4214                                "for stmt: ");
4215               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4216             }
4217           if (is_a <bb_vec_info> (vinfo))
4218             {
4219               /* In BB vectorization the ref can still participate
4220                  in dependence analysis, we just can't vectorize it.  */
4221               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4222               continue;
4223             }
4224           return false;
4225         }
4226
4227       if (is_a <loop_vec_info> (vinfo)
4228           && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4229         {
4230           if (nested_in_vect_loop_p (loop, stmt))
4231             {
4232               if (dump_enabled_p ())
4233                 {
4234                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4235                                    "not vectorized: not suitable for strided "
4236                                    "load ");
4237                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4238                 }
4239               return false;
4240             }
4241           STMT_VINFO_STRIDED_P (stmt_info) = true;
4242         }
4243
4244       /* Update DR field in stmt_vec_info struct.  */
4245
4246       /* If the dataref is in an inner-loop of the loop that is considered for
4247          for vectorization, we also want to analyze the access relative to
4248          the outer-loop (DR contains information only relative to the
4249          inner-most enclosing loop).  We do that by building a reference to the
4250          first location accessed by the inner-loop, and analyze it relative to
4251          the outer-loop.  */
4252       if (loop && nested_in_vect_loop_p (loop, stmt))
4253         {
4254           /* Build a reference to the first location accessed by the
4255              inner loop: *(BASE + INIT + OFFSET).  By construction,
4256              this address must be invariant in the inner loop, so we
4257              can consider it as being used in the outer loop.  */
4258           tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4259           tree offset = unshare_expr (DR_OFFSET (dr));
4260           tree init = unshare_expr (DR_INIT (dr));
4261           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4262                                           init, offset);
4263           tree init_addr = fold_build_pointer_plus (base, init_offset);
4264           tree init_ref = build_fold_indirect_ref (init_addr);
4265
4266           if (dump_enabled_p ())
4267             {
4268               dump_printf_loc (MSG_NOTE, vect_location,
4269                                "analyze in outer loop: ");
4270               dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4271               dump_printf (MSG_NOTE, "\n");
4272             }
4273
4274           if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4275                                      init_ref, loop))
4276             /* dr_analyze_innermost already explained the failure.  */
4277             return false;
4278
4279           if (dump_enabled_p ())
4280             {
4281               dump_printf_loc (MSG_NOTE, vect_location,
4282                                "\touter base_address: ");
4283               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4284                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4285               dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4286               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4287                                  STMT_VINFO_DR_OFFSET (stmt_info));
4288               dump_printf (MSG_NOTE,
4289                            "\n\touter constant offset from base address: ");
4290               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4291                                  STMT_VINFO_DR_INIT (stmt_info));
4292               dump_printf (MSG_NOTE, "\n\touter step: ");
4293               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4294                                  STMT_VINFO_DR_STEP (stmt_info));
4295               dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4296                            STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4297               dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4298                            STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4299               dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4300                            STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4301               dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4302                            STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4303             }
4304         }
4305
4306       gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
4307       STMT_VINFO_DATA_REF (stmt_info) = dr;
4308       if (simd_lane_access)
4309         {
4310           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4311           free_data_ref (datarefs[i]);
4312           datarefs[i] = dr;
4313         }
4314
4315       /* Set vectype for STMT.  */
4316       scalar_type = TREE_TYPE (DR_REF (dr));
4317       STMT_VINFO_VECTYPE (stmt_info)
4318         = get_vectype_for_scalar_type (scalar_type);
4319       if (!STMT_VINFO_VECTYPE (stmt_info))
4320         {
4321           if (dump_enabled_p ())
4322             {
4323               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4324                                "not vectorized: no vectype for stmt: ");
4325               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4326               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4327               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4328                                  scalar_type);
4329               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4330             }
4331
4332           if (is_a <bb_vec_info> (vinfo))
4333             {
4334               /* No vector type is fine, the ref can still participate
4335                  in dependence analysis, we just can't vectorize it.  */
4336               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4337               continue;
4338             }
4339
4340           if (gatherscatter != SG_NONE || simd_lane_access)
4341             {
4342               STMT_VINFO_DATA_REF (stmt_info) = NULL;
4343               if (gatherscatter != SG_NONE)
4344                 free_data_ref (dr);
4345             }
4346           return false;
4347         }
4348       else
4349         {
4350           if (dump_enabled_p ())
4351             {
4352               dump_printf_loc (MSG_NOTE, vect_location,
4353                                "got vectype for stmt: ");
4354               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4355               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4356                                  STMT_VINFO_VECTYPE (stmt_info));
4357               dump_printf (MSG_NOTE, "\n");
4358             }
4359         }
4360
4361       /* Adjust the minimal vectorization factor according to the
4362          vector type.  */
4363       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4364       *min_vf = upper_bound (*min_vf, vf);
4365
4366       if (gatherscatter != SG_NONE)
4367         {
4368           gather_scatter_info gs_info;
4369           if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4370                                           &gs_info)
4371               || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4372             {
4373               STMT_VINFO_DATA_REF (stmt_info) = NULL;
4374               free_data_ref (dr);
4375               if (dump_enabled_p ())
4376                 {
4377                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4378                                    (gatherscatter == GATHER) ?
4379                                    "not vectorized: not suitable for gather "
4380                                    "load " :
4381                                    "not vectorized: not suitable for scatter "
4382                                    "store ");
4383                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4384                 }
4385               return false;
4386             }
4387
4388           free_data_ref (datarefs[i]);
4389           datarefs[i] = dr;
4390           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4391         }
4392     }
4393
4394   /* We used to stop processing and prune the list here.  Verify we no
4395      longer need to.  */
4396   gcc_assert (i == datarefs.length ());
4397
4398   return true;
4399 }
4400
4401
4402 /* Function vect_get_new_vect_var.
4403
4404    Returns a name for a new variable.  The current naming scheme appends the
4405    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4406    the name of vectorizer generated variables, and appends that to NAME if
4407    provided.  */
4408
4409 tree
4410 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4411 {
4412   const char *prefix;
4413   tree new_vect_var;
4414
4415   switch (var_kind)
4416   {
4417   case vect_simple_var:
4418     prefix = "vect";
4419     break;
4420   case vect_scalar_var:
4421     prefix = "stmp";
4422     break;
4423   case vect_mask_var:
4424     prefix = "mask";
4425     break;
4426   case vect_pointer_var:
4427     prefix = "vectp";
4428     break;
4429   default:
4430     gcc_unreachable ();
4431   }
4432
4433   if (name)
4434     {
4435       char* tmp = concat (prefix, "_", name, NULL);
4436       new_vect_var = create_tmp_reg (type, tmp);
4437       free (tmp);
4438     }
4439   else
4440     new_vect_var = create_tmp_reg (type, prefix);
4441
4442   return new_vect_var;
4443 }
4444
4445 /* Like vect_get_new_vect_var but return an SSA name.  */
4446
4447 tree
4448 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4449 {
4450   const char *prefix;
4451   tree new_vect_var;
4452
4453   switch (var_kind)
4454   {
4455   case vect_simple_var:
4456     prefix = "vect";
4457     break;
4458   case vect_scalar_var:
4459     prefix = "stmp";
4460     break;
4461   case vect_pointer_var:
4462     prefix = "vectp";
4463     break;
4464   default:
4465     gcc_unreachable ();
4466   }
4467
4468   if (name)
4469     {
4470       char* tmp = concat (prefix, "_", name, NULL);
4471       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4472       free (tmp);
4473     }
4474   else
4475     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4476
4477   return new_vect_var;
4478 }
4479
4480 /* Duplicate ptr info and set alignment/misaligment on NAME from DR.  */
4481
4482 static void
4483 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4484 {
4485   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4486   int misalign = DR_MISALIGNMENT (dr);
4487   if (misalign == DR_MISALIGNMENT_UNKNOWN)
4488     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4489   else
4490     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4491                             DR_TARGET_ALIGNMENT (dr), misalign);
4492 }
4493
4494 /* Function vect_create_addr_base_for_vector_ref.
4495
4496    Create an expression that computes the address of the first memory location
4497    that will be accessed for a data reference.
4498
4499    Input:
4500    STMT: The statement containing the data reference.
4501    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4502    OFFSET: Optional. If supplied, it is be added to the initial address.
4503    LOOP:    Specify relative to which loop-nest should the address be computed.
4504             For example, when the dataref is in an inner-loop nested in an
4505             outer-loop that is now being vectorized, LOOP can be either the
4506             outer-loop, or the inner-loop.  The first memory location accessed
4507             by the following dataref ('in' points to short):
4508
4509                 for (i=0; i<N; i++)
4510                    for (j=0; j<M; j++)
4511                      s += in[i+j]
4512
4513             is as follows:
4514             if LOOP=i_loop:     &in             (relative to i_loop)
4515             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
4516    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4517             initial address.  Unlike OFFSET, which is number of elements to
4518             be added, BYTE_OFFSET is measured in bytes.
4519
4520    Output:
4521    1. Return an SSA_NAME whose value is the address of the memory location of
4522       the first vector of the data reference.
4523    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4524       these statement(s) which define the returned SSA_NAME.
4525
4526    FORNOW: We are only handling array accesses with step 1.  */
4527
4528 tree
4529 vect_create_addr_base_for_vector_ref (gimple *stmt,
4530                                       gimple_seq *new_stmt_list,
4531                                       tree offset,
4532                                       tree byte_offset)
4533 {
4534   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4535   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4536   const char *base_name;
4537   tree addr_base;
4538   tree dest;
4539   gimple_seq seq = NULL;
4540   tree vect_ptr_type;
4541   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4542   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4543   innermost_loop_behavior *drb = vect_dr_behavior (dr);
4544
4545   tree data_ref_base = unshare_expr (drb->base_address);
4546   tree base_offset = unshare_expr (drb->offset);
4547   tree init = unshare_expr (drb->init);
4548
4549   if (loop_vinfo)
4550     base_name = get_name (data_ref_base);
4551   else
4552     {
4553       base_offset = ssize_int (0);
4554       init = ssize_int (0);
4555       base_name = get_name (DR_REF (dr));
4556     }
4557
4558   /* Create base_offset */
4559   base_offset = size_binop (PLUS_EXPR,
4560                             fold_convert (sizetype, base_offset),
4561                             fold_convert (sizetype, init));
4562
4563   if (offset)
4564     {
4565       offset = fold_build2 (MULT_EXPR, sizetype,
4566                             fold_convert (sizetype, offset), step);
4567       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4568                                  base_offset, offset);
4569     }
4570   if (byte_offset)
4571     {
4572       byte_offset = fold_convert (sizetype, byte_offset);
4573       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4574                                  base_offset, byte_offset);
4575     }
4576
4577   /* base + base_offset */
4578   if (loop_vinfo)
4579     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4580   else
4581     {
4582       addr_base = build1 (ADDR_EXPR,
4583                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
4584                           unshare_expr (DR_REF (dr)));
4585     }
4586
4587   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4588   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4589   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4590   gimple_seq_add_seq (new_stmt_list, seq);
4591
4592   if (DR_PTR_INFO (dr)
4593       && TREE_CODE (addr_base) == SSA_NAME
4594       && !SSA_NAME_PTR_INFO (addr_base))
4595     {
4596       vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4597       if (offset || byte_offset)
4598         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4599     }
4600
4601   if (dump_enabled_p ())
4602     {
4603       dump_printf_loc (MSG_NOTE, vect_location, "created ");
4604       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4605       dump_printf (MSG_NOTE, "\n");
4606     }
4607
4608   return addr_base;
4609 }
4610
4611
4612 /* Function vect_create_data_ref_ptr.
4613
4614    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4615    location accessed in the loop by STMT, along with the def-use update
4616    chain to appropriately advance the pointer through the loop iterations.
4617    Also set aliasing information for the pointer.  This pointer is used by
4618    the callers to this function to create a memory reference expression for
4619    vector load/store access.
4620
4621    Input:
4622    1. STMT: a stmt that references memory. Expected to be of the form
4623          GIMPLE_ASSIGN <name, data-ref> or
4624          GIMPLE_ASSIGN <data-ref, name>.
4625    2. AGGR_TYPE: the type of the reference, which should be either a vector
4626         or an array.
4627    3. AT_LOOP: the loop where the vector memref is to be created.
4628    4. OFFSET (optional): an offset to be added to the initial address accessed
4629         by the data-ref in STMT.
4630    5. BSI: location where the new stmts are to be placed if there is no loop
4631    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4632         pointing to the initial address.
4633    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4634         to the initial address accessed by the data-ref in STMT.  This is
4635         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4636         in bytes.
4637    8. IV_STEP (optional, defaults to NULL): the amount that should be added
4638         to the IV during each iteration of the loop.  NULL says to move
4639         by one copy of AGGR_TYPE up or down, depending on the step of the
4640         data reference.
4641
4642    Output:
4643    1. Declare a new ptr to vector_type, and have it point to the base of the
4644       data reference (initial addressed accessed by the data reference).
4645       For example, for vector of type V8HI, the following code is generated:
4646
4647       v8hi *ap;
4648       ap = (v8hi *)initial_address;
4649
4650       if OFFSET is not supplied:
4651          initial_address = &a[init];
4652       if OFFSET is supplied:
4653          initial_address = &a[init + OFFSET];
4654       if BYTE_OFFSET is supplied:
4655          initial_address = &a[init] + BYTE_OFFSET;
4656
4657       Return the initial_address in INITIAL_ADDRESS.
4658
4659    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4660       update the pointer in each iteration of the loop.
4661
4662       Return the increment stmt that updates the pointer in PTR_INCR.
4663
4664    3. Set INV_P to true if the access pattern of the data reference in the
4665       vectorized loop is invariant.  Set it to false otherwise.
4666
4667    4. Return the pointer.  */
4668
4669 tree
4670 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4671                           tree offset, tree *initial_address,
4672                           gimple_stmt_iterator *gsi, gimple **ptr_incr,
4673                           bool only_init, bool *inv_p, tree byte_offset,
4674                           tree iv_step)
4675 {
4676   const char *base_name;
4677   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4678   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4679   struct loop *loop = NULL;
4680   bool nested_in_vect_loop = false;
4681   struct loop *containing_loop = NULL;
4682   tree aggr_ptr_type;
4683   tree aggr_ptr;
4684   tree new_temp;
4685   gimple_seq new_stmt_list = NULL;
4686   edge pe = NULL;
4687   basic_block new_bb;
4688   tree aggr_ptr_init;
4689   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4690   tree aptr;
4691   gimple_stmt_iterator incr_gsi;
4692   bool insert_after;
4693   tree indx_before_incr, indx_after_incr;
4694   gimple *incr;
4695   tree step;
4696   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4697
4698   gcc_assert (iv_step != NULL_TREE
4699               || TREE_CODE (aggr_type) == ARRAY_TYPE
4700               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4701
4702   if (loop_vinfo)
4703     {
4704       loop = LOOP_VINFO_LOOP (loop_vinfo);
4705       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4706       containing_loop = (gimple_bb (stmt))->loop_father;
4707       pe = loop_preheader_edge (loop);
4708     }
4709   else
4710     {
4711       gcc_assert (bb_vinfo);
4712       only_init = true;
4713       *ptr_incr = NULL;
4714     }
4715
4716   /* Check the step (evolution) of the load in LOOP, and record
4717      whether it's invariant.  */
4718   step = vect_dr_behavior (dr)->step;
4719   if (integer_zerop (step))
4720     *inv_p = true;
4721   else
4722     *inv_p = false;
4723
4724   /* Create an expression for the first address accessed by this load
4725      in LOOP.  */
4726   base_name = get_name (DR_BASE_ADDRESS (dr));
4727
4728   if (dump_enabled_p ())
4729     {
4730       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4731       dump_printf_loc (MSG_NOTE, vect_location,
4732                        "create %s-pointer variable to type: ",
4733                        get_tree_code_name (TREE_CODE (aggr_type)));
4734       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4735       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4736         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4737       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4738         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4739       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4740         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4741       else
4742         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4743       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4744       dump_printf (MSG_NOTE, "\n");
4745     }
4746
4747   /* (1) Create the new aggregate-pointer variable.
4748      Vector and array types inherit the alias set of their component
4749      type by default so we need to use a ref-all pointer if the data
4750      reference does not conflict with the created aggregated data
4751      reference because it is not addressable.  */
4752   bool need_ref_all = false;
4753   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4754                               get_alias_set (DR_REF (dr))))
4755     need_ref_all = true;
4756   /* Likewise for any of the data references in the stmt group.  */
4757   else if (DR_GROUP_SIZE (stmt_info) > 1)
4758     {
4759       gimple *orig_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info);
4760       do
4761         {
4762           stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4763           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4764           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4765                                       get_alias_set (DR_REF (sdr))))
4766             {
4767               need_ref_all = true;
4768               break;
4769             }
4770           orig_stmt = DR_GROUP_NEXT_ELEMENT (sinfo);
4771         }
4772       while (orig_stmt);
4773     }
4774   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4775                                                need_ref_all);
4776   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4777
4778
4779   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4780      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4781      def-use update cycles for the pointer: one relative to the outer-loop
4782      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4783      to the inner-loop (which is the inner-most loop containing the dataref),
4784      and this is done be step (5) below.
4785
4786      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4787      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4788      redundant.  Steps (3),(4) create the following:
4789
4790         vp0 = &base_addr;
4791         LOOP:   vp1 = phi(vp0,vp2)
4792                 ...
4793                 ...
4794                 vp2 = vp1 + step
4795                 goto LOOP
4796
4797      If there is an inner-loop nested in loop, then step (5) will also be
4798      applied, and an additional update in the inner-loop will be created:
4799
4800         vp0 = &base_addr;
4801         LOOP:   vp1 = phi(vp0,vp2)
4802                 ...
4803         inner:     vp3 = phi(vp1,vp4)
4804                    vp4 = vp3 + inner_step
4805                    if () goto inner
4806                 ...
4807                 vp2 = vp1 + step
4808                 if () goto LOOP   */
4809
4810   /* (2) Calculate the initial address of the aggregate-pointer, and set
4811      the aggregate-pointer to point to it before the loop.  */
4812
4813   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4814
4815   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4816                                                    offset, byte_offset);
4817   if (new_stmt_list)
4818     {
4819       if (pe)
4820         {
4821           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4822           gcc_assert (!new_bb);
4823         }
4824       else
4825         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4826     }
4827
4828   *initial_address = new_temp;
4829   aggr_ptr_init = new_temp;
4830
4831   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4832      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4833      inner-loop nested in LOOP (during outer-loop vectorization).  */
4834
4835   /* No update in loop is required.  */
4836   if (only_init && (!loop_vinfo || at_loop == loop))
4837     aptr = aggr_ptr_init;
4838   else
4839     {
4840       if (iv_step == NULL_TREE)
4841         {
4842           /* The step of the aggregate pointer is the type size.  */
4843           iv_step = TYPE_SIZE_UNIT (aggr_type);
4844           /* One exception to the above is when the scalar step of the load in
4845              LOOP is zero. In this case the step here is also zero.  */
4846           if (*inv_p)
4847             iv_step = size_zero_node;
4848           else if (tree_int_cst_sgn (step) == -1)
4849             iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4850         }
4851
4852       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4853
4854       create_iv (aggr_ptr_init,
4855                  fold_convert (aggr_ptr_type, iv_step),
4856                  aggr_ptr, loop, &incr_gsi, insert_after,
4857                  &indx_before_incr, &indx_after_incr);
4858       incr = gsi_stmt (incr_gsi);
4859       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4860
4861       /* Copy the points-to information if it exists. */
4862       if (DR_PTR_INFO (dr))
4863         {
4864           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4865           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4866         }
4867       if (ptr_incr)
4868         *ptr_incr = incr;
4869
4870       aptr = indx_before_incr;
4871     }
4872
4873   if (!nested_in_vect_loop || only_init)
4874     return aptr;
4875
4876
4877   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4878      nested in LOOP, if exists.  */
4879
4880   gcc_assert (nested_in_vect_loop);
4881   if (!only_init)
4882     {
4883       standard_iv_increment_position (containing_loop, &incr_gsi,
4884                                       &insert_after);
4885       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4886                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4887                  &indx_after_incr);
4888       incr = gsi_stmt (incr_gsi);
4889       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4890
4891       /* Copy the points-to information if it exists. */
4892       if (DR_PTR_INFO (dr))
4893         {
4894           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4895           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4896         }
4897       if (ptr_incr)
4898         *ptr_incr = incr;
4899
4900       return indx_before_incr;
4901     }
4902   else
4903     gcc_unreachable ();
4904 }
4905
4906
4907 /* Function bump_vector_ptr
4908
4909    Increment a pointer (to a vector type) by vector-size. If requested,
4910    i.e. if PTR-INCR is given, then also connect the new increment stmt
4911    to the existing def-use update-chain of the pointer, by modifying
4912    the PTR_INCR as illustrated below:
4913
4914    The pointer def-use update-chain before this function:
4915                         DATAREF_PTR = phi (p_0, p_2)
4916                         ....
4917         PTR_INCR:       p_2 = DATAREF_PTR + step
4918
4919    The pointer def-use update-chain after this function:
4920                         DATAREF_PTR = phi (p_0, p_2)
4921                         ....
4922                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4923                         ....
4924         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4925
4926    Input:
4927    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4928                  in the loop.
4929    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4930               the loop.  The increment amount across iterations is expected
4931               to be vector_size.
4932    BSI - location where the new update stmt is to be placed.
4933    STMT - the original scalar memory-access stmt that is being vectorized.
4934    BUMP - optional. The offset by which to bump the pointer. If not given,
4935           the offset is assumed to be vector_size.
4936
4937    Output: Return NEW_DATAREF_PTR as illustrated above.
4938
4939 */
4940
4941 tree
4942 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4943                  gimple *stmt, tree bump)
4944 {
4945   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4946   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4947   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4948   tree update = TYPE_SIZE_UNIT (vectype);
4949   gassign *incr_stmt;
4950   ssa_op_iter iter;
4951   use_operand_p use_p;
4952   tree new_dataref_ptr;
4953
4954   if (bump)
4955     update = bump;
4956
4957   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4958     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4959   else
4960     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4961   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4962                                    dataref_ptr, update);
4963   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4964
4965   /* Copy the points-to information if it exists. */
4966   if (DR_PTR_INFO (dr))
4967     {
4968       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4969       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4970     }
4971
4972   if (!ptr_incr)
4973     return new_dataref_ptr;
4974
4975   /* Update the vector-pointer's cross-iteration increment.  */
4976   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4977     {
4978       tree use = USE_FROM_PTR (use_p);
4979
4980       if (use == dataref_ptr)
4981         SET_USE (use_p, new_dataref_ptr);
4982       else
4983         gcc_assert (operand_equal_p (use, update, 0));
4984     }
4985
4986   return new_dataref_ptr;
4987 }
4988
4989
4990 /* Copy memory reference info such as base/clique from the SRC reference
4991    to the DEST MEM_REF.  */
4992
4993 void
4994 vect_copy_ref_info (tree dest, tree src)
4995 {
4996   if (TREE_CODE (dest) != MEM_REF)
4997     return;
4998
4999   tree src_base = src;
5000   while (handled_component_p (src_base))
5001     src_base = TREE_OPERAND (src_base, 0);
5002   if (TREE_CODE (src_base) != MEM_REF
5003       && TREE_CODE (src_base) != TARGET_MEM_REF)
5004     return;
5005
5006   MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5007   MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5008 }
5009
5010
5011 /* Function vect_create_destination_var.
5012
5013    Create a new temporary of type VECTYPE.  */
5014
5015 tree
5016 vect_create_destination_var (tree scalar_dest, tree vectype)
5017 {
5018   tree vec_dest;
5019   const char *name;
5020   char *new_name;
5021   tree type;
5022   enum vect_var_kind kind;
5023
5024   kind = vectype
5025     ? VECTOR_BOOLEAN_TYPE_P (vectype)
5026     ? vect_mask_var
5027     : vect_simple_var
5028     : vect_scalar_var;
5029   type = vectype ? vectype : TREE_TYPE (scalar_dest);
5030
5031   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5032
5033   name = get_name (scalar_dest);
5034   if (name)
5035     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5036   else
5037     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5038   vec_dest = vect_get_new_vect_var (type, kind, new_name);
5039   free (new_name);
5040
5041   return vec_dest;
5042 }
5043
5044 /* Function vect_grouped_store_supported.
5045
5046    Returns TRUE if interleave high and interleave low permutations
5047    are supported, and FALSE otherwise.  */
5048
5049 bool
5050 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5051 {
5052   machine_mode mode = TYPE_MODE (vectype);
5053
5054   /* vect_permute_store_chain requires the group size to be equal to 3 or
5055      be a power of two.  */
5056   if (count != 3 && exact_log2 (count) == -1)
5057     {
5058       if (dump_enabled_p ())
5059         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5060                          "the size of the group of accesses"
5061                          " is not a power of 2 or not eqaul to 3\n");
5062       return false;
5063     }
5064
5065   /* Check that the permutation is supported.  */
5066   if (VECTOR_MODE_P (mode))
5067     {
5068       unsigned int i;
5069       if (count == 3)
5070         {
5071           unsigned int j0 = 0, j1 = 0, j2 = 0;
5072           unsigned int i, j;
5073
5074           unsigned int nelt;
5075           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5076             {
5077               if (dump_enabled_p ())
5078                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5079                                  "cannot handle groups of 3 stores for"
5080                                  " variable-length vectors\n");
5081               return false;
5082             }
5083
5084           vec_perm_builder sel (nelt, nelt, 1);
5085           sel.quick_grow (nelt);
5086           vec_perm_indices indices;
5087           for (j = 0; j < 3; j++)
5088             {
5089               int nelt0 = ((3 - j) * nelt) % 3;
5090               int nelt1 = ((3 - j) * nelt + 1) % 3;
5091               int nelt2 = ((3 - j) * nelt + 2) % 3;
5092               for (i = 0; i < nelt; i++)
5093                 {
5094                   if (3 * i + nelt0 < nelt)
5095                     sel[3 * i + nelt0] = j0++;
5096                   if (3 * i + nelt1 < nelt)
5097                     sel[3 * i + nelt1] = nelt + j1++;
5098                   if (3 * i + nelt2 < nelt)
5099                     sel[3 * i + nelt2] = 0;
5100                 }
5101               indices.new_vector (sel, 2, nelt);
5102               if (!can_vec_perm_const_p (mode, indices))
5103                 {
5104                   if (dump_enabled_p ())
5105                     dump_printf (MSG_MISSED_OPTIMIZATION,
5106                                  "permutation op not supported by target.\n");
5107                   return false;
5108                 }
5109
5110               for (i = 0; i < nelt; i++)
5111                 {
5112                   if (3 * i + nelt0 < nelt)
5113                     sel[3 * i + nelt0] = 3 * i + nelt0;
5114                   if (3 * i + nelt1 < nelt)
5115                     sel[3 * i + nelt1] = 3 * i + nelt1;
5116                   if (3 * i + nelt2 < nelt)
5117                     sel[3 * i + nelt2] = nelt + j2++;
5118                 }
5119               indices.new_vector (sel, 2, nelt);
5120               if (!can_vec_perm_const_p (mode, indices))
5121                 {
5122                   if (dump_enabled_p ())
5123                     dump_printf (MSG_MISSED_OPTIMIZATION,
5124                                  "permutation op not supported by target.\n");
5125                   return false;
5126                 }
5127             }
5128           return true;
5129         }
5130       else
5131         {
5132           /* If length is not equal to 3 then only power of 2 is supported.  */
5133           gcc_assert (pow2p_hwi (count));
5134           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5135
5136           /* The encoding has 2 interleaved stepped patterns.  */
5137           vec_perm_builder sel (nelt, 2, 3);
5138           sel.quick_grow (6);
5139           for (i = 0; i < 3; i++)
5140             {
5141               sel[i * 2] = i;
5142               sel[i * 2 + 1] = i + nelt;
5143             }
5144           vec_perm_indices indices (sel, 2, nelt);
5145           if (can_vec_perm_const_p (mode, indices))
5146             {
5147               for (i = 0; i < 6; i++)
5148                 sel[i] += exact_div (nelt, 2);
5149               indices.new_vector (sel, 2, nelt);
5150               if (can_vec_perm_const_p (mode, indices))
5151                 return true;
5152             }
5153         }
5154     }
5155
5156   if (dump_enabled_p ())
5157     dump_printf (MSG_MISSED_OPTIMIZATION,
5158                  "permutaion op not supported by target.\n");
5159   return false;
5160 }
5161
5162
5163 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5164    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5165
5166 bool
5167 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5168                             bool masked_p)
5169 {
5170   if (masked_p)
5171     return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5172                                          vec_mask_store_lanes_optab,
5173                                          vectype, count);
5174   else
5175     return vect_lanes_optab_supported_p ("vec_store_lanes",
5176                                          vec_store_lanes_optab,
5177                                          vectype, count);
5178 }
5179
5180
5181 /* Function vect_permute_store_chain.
5182
5183    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5184    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5185    the data correctly for the stores.  Return the final references for stores
5186    in RESULT_CHAIN.
5187
5188    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5189    The input is 4 vectors each containing 8 elements.  We assign a number to
5190    each element, the input sequence is:
5191
5192    1st vec:   0  1  2  3  4  5  6  7
5193    2nd vec:   8  9 10 11 12 13 14 15
5194    3rd vec:  16 17 18 19 20 21 22 23
5195    4th vec:  24 25 26 27 28 29 30 31
5196
5197    The output sequence should be:
5198
5199    1st vec:  0  8 16 24  1  9 17 25
5200    2nd vec:  2 10 18 26  3 11 19 27
5201    3rd vec:  4 12 20 28  5 13 21 30
5202    4th vec:  6 14 22 30  7 15 23 31
5203
5204    i.e., we interleave the contents of the four vectors in their order.
5205
5206    We use interleave_high/low instructions to create such output.  The input of
5207    each interleave_high/low operation is two vectors:
5208    1st vec    2nd vec
5209    0 1 2 3    4 5 6 7
5210    the even elements of the result vector are obtained left-to-right from the
5211    high/low elements of the first vector.  The odd elements of the result are
5212    obtained left-to-right from the high/low elements of the second vector.
5213    The output of interleave_high will be:   0 4 1 5
5214    and of interleave_low:                   2 6 3 7
5215
5216
5217    The permutation is done in log LENGTH stages.  In each stage interleave_high
5218    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5219    where the first argument is taken from the first half of DR_CHAIN and the
5220    second argument from it's second half.
5221    In our example,
5222
5223    I1: interleave_high (1st vec, 3rd vec)
5224    I2: interleave_low (1st vec, 3rd vec)
5225    I3: interleave_high (2nd vec, 4th vec)
5226    I4: interleave_low (2nd vec, 4th vec)
5227
5228    The output for the first stage is:
5229
5230    I1:  0 16  1 17  2 18  3 19
5231    I2:  4 20  5 21  6 22  7 23
5232    I3:  8 24  9 25 10 26 11 27
5233    I4: 12 28 13 29 14 30 15 31
5234
5235    The output of the second stage, i.e. the final result is:
5236
5237    I1:  0  8 16 24  1  9 17 25
5238    I2:  2 10 18 26  3 11 19 27
5239    I3:  4 12 20 28  5 13 21 30
5240    I4:  6 14 22 30  7 15 23 31.  */
5241
5242 void
5243 vect_permute_store_chain (vec<tree> dr_chain,
5244                           unsigned int length,
5245                           gimple *stmt,
5246                           gimple_stmt_iterator *gsi,
5247                           vec<tree> *result_chain)
5248 {
5249   tree vect1, vect2, high, low;
5250   gimple *perm_stmt;
5251   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5252   tree perm_mask_low, perm_mask_high;
5253   tree data_ref;
5254   tree perm3_mask_low, perm3_mask_high;
5255   unsigned int i, j, n, log_length = exact_log2 (length);
5256
5257   result_chain->quick_grow (length);
5258   memcpy (result_chain->address (), dr_chain.address (),
5259           length * sizeof (tree));
5260
5261   if (length == 3)
5262     {
5263       /* vect_grouped_store_supported ensures that this is constant.  */
5264       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5265       unsigned int j0 = 0, j1 = 0, j2 = 0;
5266
5267       vec_perm_builder sel (nelt, nelt, 1);
5268       sel.quick_grow (nelt);
5269       vec_perm_indices indices;
5270       for (j = 0; j < 3; j++)
5271         {
5272           int nelt0 = ((3 - j) * nelt) % 3;
5273           int nelt1 = ((3 - j) * nelt + 1) % 3;
5274           int nelt2 = ((3 - j) * nelt + 2) % 3;
5275
5276           for (i = 0; i < nelt; i++)
5277             {
5278               if (3 * i + nelt0 < nelt)
5279                 sel[3 * i + nelt0] = j0++;
5280               if (3 * i + nelt1 < nelt)
5281                 sel[3 * i + nelt1] = nelt + j1++;
5282               if (3 * i + nelt2 < nelt)
5283                 sel[3 * i + nelt2] = 0;
5284             }
5285           indices.new_vector (sel, 2, nelt);
5286           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5287
5288           for (i = 0; i < nelt; i++)
5289             {
5290               if (3 * i + nelt0 < nelt)
5291                 sel[3 * i + nelt0] = 3 * i + nelt0;
5292               if (3 * i + nelt1 < nelt)
5293                 sel[3 * i + nelt1] = 3 * i + nelt1;
5294               if (3 * i + nelt2 < nelt)
5295                 sel[3 * i + nelt2] = nelt + j2++;
5296             }
5297           indices.new_vector (sel, 2, nelt);
5298           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5299
5300           vect1 = dr_chain[0];
5301           vect2 = dr_chain[1];
5302
5303           /* Create interleaving stmt:
5304              low = VEC_PERM_EXPR <vect1, vect2,
5305                                   {j, nelt, *, j + 1, nelt + j + 1, *,
5306                                    j + 2, nelt + j + 2, *, ...}>  */
5307           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5308           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5309                                            vect2, perm3_mask_low);
5310           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5311
5312           vect1 = data_ref;
5313           vect2 = dr_chain[2];
5314           /* Create interleaving stmt:
5315              low = VEC_PERM_EXPR <vect1, vect2,
5316                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
5317                                    6, 7, nelt + j + 2, ...}>  */
5318           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5319           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5320                                            vect2, perm3_mask_high);
5321           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5322           (*result_chain)[j] = data_ref;
5323         }
5324     }
5325   else
5326     {
5327       /* If length is not equal to 3 then only power of 2 is supported.  */
5328       gcc_assert (pow2p_hwi (length));
5329
5330       /* The encoding has 2 interleaved stepped patterns.  */
5331       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5332       vec_perm_builder sel (nelt, 2, 3);
5333       sel.quick_grow (6);
5334       for (i = 0; i < 3; i++)
5335         {
5336           sel[i * 2] = i;
5337           sel[i * 2 + 1] = i + nelt;
5338         }
5339         vec_perm_indices indices (sel, 2, nelt);
5340         perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5341
5342         for (i = 0; i < 6; i++)
5343           sel[i] += exact_div (nelt, 2);
5344         indices.new_vector (sel, 2, nelt);
5345         perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5346
5347         for (i = 0, n = log_length; i < n; i++)
5348           {
5349             for (j = 0; j < length/2; j++)
5350               {
5351                 vect1 = dr_chain[j];
5352                 vect2 = dr_chain[j+length/2];
5353
5354                 /* Create interleaving stmt:
5355                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5356                                                         ...}>  */
5357                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5358                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5359                                                  vect2, perm_mask_high);
5360                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5361                 (*result_chain)[2*j] = high;
5362
5363                 /* Create interleaving stmt:
5364                    low = VEC_PERM_EXPR <vect1, vect2,
5365                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5366                                          ...}>  */
5367                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5368                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5369                                                  vect2, perm_mask_low);
5370                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5371                 (*result_chain)[2*j+1] = low;
5372               }
5373             memcpy (dr_chain.address (), result_chain->address (),
5374                     length * sizeof (tree));
5375           }
5376     }
5377 }
5378
5379 /* Function vect_setup_realignment
5380
5381    This function is called when vectorizing an unaligned load using
5382    the dr_explicit_realign[_optimized] scheme.
5383    This function generates the following code at the loop prolog:
5384
5385       p = initial_addr;
5386    x  msq_init = *(floor(p));   # prolog load
5387       realignment_token = call target_builtin;
5388     loop:
5389    x  msq = phi (msq_init, ---)
5390
5391    The stmts marked with x are generated only for the case of
5392    dr_explicit_realign_optimized.
5393
5394    The code above sets up a new (vector) pointer, pointing to the first
5395    location accessed by STMT, and a "floor-aligned" load using that pointer.
5396    It also generates code to compute the "realignment-token" (if the relevant
5397    target hook was defined), and creates a phi-node at the loop-header bb
5398    whose arguments are the result of the prolog-load (created by this
5399    function) and the result of a load that takes place in the loop (to be
5400    created by the caller to this function).
5401
5402    For the case of dr_explicit_realign_optimized:
5403    The caller to this function uses the phi-result (msq) to create the
5404    realignment code inside the loop, and sets up the missing phi argument,
5405    as follows:
5406     loop:
5407       msq = phi (msq_init, lsq)
5408       lsq = *(floor(p'));        # load in loop
5409       result = realign_load (msq, lsq, realignment_token);
5410
5411    For the case of dr_explicit_realign:
5412     loop:
5413       msq = *(floor(p));        # load in loop
5414       p' = p + (VS-1);
5415       lsq = *(floor(p'));       # load in loop
5416       result = realign_load (msq, lsq, realignment_token);
5417
5418    Input:
5419    STMT - (scalar) load stmt to be vectorized. This load accesses
5420           a memory location that may be unaligned.
5421    BSI - place where new code is to be inserted.
5422    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5423                               is used.
5424
5425    Output:
5426    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5427                        target hook, if defined.
5428    Return value - the result of the loop-header phi node.  */
5429
5430 tree
5431 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5432                         tree *realignment_token,
5433                         enum dr_alignment_support alignment_support_scheme,
5434                         tree init_addr,
5435                         struct loop **at_loop)
5436 {
5437   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5438   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5439   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5440   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5441   struct loop *loop = NULL;
5442   edge pe = NULL;
5443   tree scalar_dest = gimple_assign_lhs (stmt);
5444   tree vec_dest;
5445   gimple *inc;
5446   tree ptr;
5447   tree data_ref;
5448   basic_block new_bb;
5449   tree msq_init = NULL_TREE;
5450   tree new_temp;
5451   gphi *phi_stmt;
5452   tree msq = NULL_TREE;
5453   gimple_seq stmts = NULL;
5454   bool inv_p;
5455   bool compute_in_loop = false;
5456   bool nested_in_vect_loop = false;
5457   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5458   struct loop *loop_for_initial_load = NULL;
5459
5460   if (loop_vinfo)
5461     {
5462       loop = LOOP_VINFO_LOOP (loop_vinfo);
5463       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5464     }
5465
5466   gcc_assert (alignment_support_scheme == dr_explicit_realign
5467               || alignment_support_scheme == dr_explicit_realign_optimized);
5468
5469   /* We need to generate three things:
5470      1. the misalignment computation
5471      2. the extra vector load (for the optimized realignment scheme).
5472      3. the phi node for the two vectors from which the realignment is
5473       done (for the optimized realignment scheme).  */
5474
5475   /* 1. Determine where to generate the misalignment computation.
5476
5477      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5478      calculation will be generated by this function, outside the loop (in the
5479      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5480      caller, inside the loop.
5481
5482      Background: If the misalignment remains fixed throughout the iterations of
5483      the loop, then both realignment schemes are applicable, and also the
5484      misalignment computation can be done outside LOOP.  This is because we are
5485      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5486      are a multiple of VS (the Vector Size), and therefore the misalignment in
5487      different vectorized LOOP iterations is always the same.
5488      The problem arises only if the memory access is in an inner-loop nested
5489      inside LOOP, which is now being vectorized using outer-loop vectorization.
5490      This is the only case when the misalignment of the memory access may not
5491      remain fixed throughout the iterations of the inner-loop (as explained in
5492      detail in vect_supportable_dr_alignment).  In this case, not only is the
5493      optimized realignment scheme not applicable, but also the misalignment
5494      computation (and generation of the realignment token that is passed to
5495      REALIGN_LOAD) have to be done inside the loop.
5496
5497      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5498      or not, which in turn determines if the misalignment is computed inside
5499      the inner-loop, or outside LOOP.  */
5500
5501   if (init_addr != NULL_TREE || !loop_vinfo)
5502     {
5503       compute_in_loop = true;
5504       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5505     }
5506
5507
5508   /* 2. Determine where to generate the extra vector load.
5509
5510      For the optimized realignment scheme, instead of generating two vector
5511      loads in each iteration, we generate a single extra vector load in the
5512      preheader of the loop, and in each iteration reuse the result of the
5513      vector load from the previous iteration.  In case the memory access is in
5514      an inner-loop nested inside LOOP, which is now being vectorized using
5515      outer-loop vectorization, we need to determine whether this initial vector
5516      load should be generated at the preheader of the inner-loop, or can be
5517      generated at the preheader of LOOP.  If the memory access has no evolution
5518      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5519      to be generated inside LOOP (in the preheader of the inner-loop).  */
5520
5521   if (nested_in_vect_loop)
5522     {
5523       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5524       bool invariant_in_outerloop =
5525             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5526       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5527     }
5528   else
5529     loop_for_initial_load = loop;
5530   if (at_loop)
5531     *at_loop = loop_for_initial_load;
5532
5533   if (loop_for_initial_load)
5534     pe = loop_preheader_edge (loop_for_initial_load);
5535
5536   /* 3. For the case of the optimized realignment, create the first vector
5537       load at the loop preheader.  */
5538
5539   if (alignment_support_scheme == dr_explicit_realign_optimized)
5540     {
5541       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5542       gassign *new_stmt;
5543
5544       gcc_assert (!compute_in_loop);
5545       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5546       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5547                                       NULL_TREE, &init_addr, NULL, &inc,
5548                                       true, &inv_p);
5549       if (TREE_CODE (ptr) == SSA_NAME)
5550         new_temp = copy_ssa_name (ptr);
5551       else
5552         new_temp = make_ssa_name (TREE_TYPE (ptr));
5553       unsigned int align = DR_TARGET_ALIGNMENT (dr);
5554       new_stmt = gimple_build_assign
5555                    (new_temp, BIT_AND_EXPR, ptr,
5556                     build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5557       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5558       gcc_assert (!new_bb);
5559       data_ref
5560         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5561                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5562       vect_copy_ref_info (data_ref, DR_REF (dr));
5563       new_stmt = gimple_build_assign (vec_dest, data_ref);
5564       new_temp = make_ssa_name (vec_dest, new_stmt);
5565       gimple_assign_set_lhs (new_stmt, new_temp);
5566       if (pe)
5567         {
5568           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5569           gcc_assert (!new_bb);
5570         }
5571       else
5572          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5573
5574       msq_init = gimple_assign_lhs (new_stmt);
5575     }
5576
5577   /* 4. Create realignment token using a target builtin, if available.
5578       It is done either inside the containing loop, or before LOOP (as
5579       determined above).  */
5580
5581   if (targetm.vectorize.builtin_mask_for_load)
5582     {
5583       gcall *new_stmt;
5584       tree builtin_decl;
5585
5586       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
5587       if (!init_addr)
5588         {
5589           /* Generate the INIT_ADDR computation outside LOOP.  */
5590           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5591                                                             NULL_TREE);
5592           if (loop)
5593             {
5594               pe = loop_preheader_edge (loop);
5595               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5596               gcc_assert (!new_bb);
5597             }
5598           else
5599              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5600         }
5601
5602       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5603       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5604       vec_dest =
5605         vect_create_destination_var (scalar_dest,
5606                                      gimple_call_return_type (new_stmt));
5607       new_temp = make_ssa_name (vec_dest, new_stmt);
5608       gimple_call_set_lhs (new_stmt, new_temp);
5609
5610       if (compute_in_loop)
5611         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5612       else
5613         {
5614           /* Generate the misalignment computation outside LOOP.  */
5615           pe = loop_preheader_edge (loop);
5616           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5617           gcc_assert (!new_bb);
5618         }
5619
5620       *realignment_token = gimple_call_lhs (new_stmt);
5621
5622       /* The result of the CALL_EXPR to this builtin is determined from
5623          the value of the parameter and no global variables are touched
5624          which makes the builtin a "const" function.  Requiring the
5625          builtin to have the "const" attribute makes it unnecessary
5626          to call mark_call_clobbered.  */
5627       gcc_assert (TREE_READONLY (builtin_decl));
5628     }
5629
5630   if (alignment_support_scheme == dr_explicit_realign)
5631     return msq;
5632
5633   gcc_assert (!compute_in_loop);
5634   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5635
5636
5637   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5638
5639   pe = loop_preheader_edge (containing_loop);
5640   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5641   msq = make_ssa_name (vec_dest);
5642   phi_stmt = create_phi_node (msq, containing_loop->header);
5643   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5644
5645   return msq;
5646 }
5647
5648
5649 /* Function vect_grouped_load_supported.
5650
5651    COUNT is the size of the load group (the number of statements plus the
5652    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5653    only one statement, with a gap of COUNT - 1.
5654
5655    Returns true if a suitable permute exists.  */
5656
5657 bool
5658 vect_grouped_load_supported (tree vectype, bool single_element_p,
5659                              unsigned HOST_WIDE_INT count)
5660 {
5661   machine_mode mode = TYPE_MODE (vectype);
5662
5663   /* If this is single-element interleaving with an element distance
5664      that leaves unused vector loads around punt - we at least create
5665      very sub-optimal code in that case (and blow up memory,
5666      see PR65518).  */
5667   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5668     {
5669       if (dump_enabled_p ())
5670         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5671                          "single-element interleaving not supported "
5672                          "for not adjacent vector loads\n");
5673       return false;
5674     }
5675
5676   /* vect_permute_load_chain requires the group size to be equal to 3 or
5677      be a power of two.  */
5678   if (count != 3 && exact_log2 (count) == -1)
5679     {
5680       if (dump_enabled_p ())
5681         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5682                          "the size of the group of accesses"
5683                          " is not a power of 2 or not equal to 3\n");
5684       return false;
5685     }
5686
5687   /* Check that the permutation is supported.  */
5688   if (VECTOR_MODE_P (mode))
5689     {
5690       unsigned int i, j;
5691       if (count == 3)
5692         {
5693           unsigned int nelt;
5694           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5695             {
5696               if (dump_enabled_p ())
5697                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5698                                  "cannot handle groups of 3 loads for"
5699                                  " variable-length vectors\n");
5700               return false;
5701             }
5702
5703           vec_perm_builder sel (nelt, nelt, 1);
5704           sel.quick_grow (nelt);
5705           vec_perm_indices indices;
5706           unsigned int k;
5707           for (k = 0; k < 3; k++)
5708             {
5709               for (i = 0; i < nelt; i++)
5710                 if (3 * i + k < 2 * nelt)
5711                   sel[i] = 3 * i + k;
5712                 else
5713                   sel[i] = 0;
5714               indices.new_vector (sel, 2, nelt);
5715               if (!can_vec_perm_const_p (mode, indices))
5716                 {
5717                   if (dump_enabled_p ())
5718                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5719                                      "shuffle of 3 loads is not supported by"
5720                                      " target\n");
5721                   return false;
5722                 }
5723               for (i = 0, j = 0; i < nelt; i++)
5724                 if (3 * i + k < 2 * nelt)
5725                   sel[i] = i;
5726                 else
5727                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5728               indices.new_vector (sel, 2, nelt);
5729               if (!can_vec_perm_const_p (mode, indices))
5730                 {
5731                   if (dump_enabled_p ())
5732                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5733                                      "shuffle of 3 loads is not supported by"
5734                                      " target\n");
5735                   return false;
5736                 }
5737             }
5738           return true;
5739         }
5740       else
5741         {
5742           /* If length is not equal to 3 then only power of 2 is supported.  */
5743           gcc_assert (pow2p_hwi (count));
5744           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5745
5746           /* The encoding has a single stepped pattern.  */
5747           vec_perm_builder sel (nelt, 1, 3);
5748           sel.quick_grow (3);
5749           for (i = 0; i < 3; i++)
5750             sel[i] = i * 2;
5751           vec_perm_indices indices (sel, 2, nelt);
5752           if (can_vec_perm_const_p (mode, indices))
5753             {
5754               for (i = 0; i < 3; i++)
5755                 sel[i] = i * 2 + 1;
5756               indices.new_vector (sel, 2, nelt);
5757               if (can_vec_perm_const_p (mode, indices))
5758                 return true;
5759             }
5760         }
5761     }
5762
5763   if (dump_enabled_p ())
5764     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5765                      "extract even/odd not supported by target\n");
5766   return false;
5767 }
5768
5769 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5770    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5771
5772 bool
5773 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5774                            bool masked_p)
5775 {
5776   if (masked_p)
5777     return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5778                                          vec_mask_load_lanes_optab,
5779                                          vectype, count);
5780   else
5781     return vect_lanes_optab_supported_p ("vec_load_lanes",
5782                                          vec_load_lanes_optab,
5783                                          vectype, count);
5784 }
5785
5786 /* Function vect_permute_load_chain.
5787
5788    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5789    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5790    the input data correctly.  Return the final references for loads in
5791    RESULT_CHAIN.
5792
5793    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5794    The input is 4 vectors each containing 8 elements. We assign a number to each
5795    element, the input sequence is:
5796
5797    1st vec:   0  1  2  3  4  5  6  7
5798    2nd vec:   8  9 10 11 12 13 14 15
5799    3rd vec:  16 17 18 19 20 21 22 23
5800    4th vec:  24 25 26 27 28 29 30 31
5801
5802    The output sequence should be:
5803
5804    1st vec:  0 4  8 12 16 20 24 28
5805    2nd vec:  1 5  9 13 17 21 25 29
5806    3rd vec:  2 6 10 14 18 22 26 30
5807    4th vec:  3 7 11 15 19 23 27 31
5808
5809    i.e., the first output vector should contain the first elements of each
5810    interleaving group, etc.
5811
5812    We use extract_even/odd instructions to create such output.  The input of
5813    each extract_even/odd operation is two vectors
5814    1st vec    2nd vec
5815    0 1 2 3    4 5 6 7
5816
5817    and the output is the vector of extracted even/odd elements.  The output of
5818    extract_even will be:   0 2 4 6
5819    and of extract_odd:     1 3 5 7
5820
5821
5822    The permutation is done in log LENGTH stages.  In each stage extract_even
5823    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5824    their order.  In our example,
5825
5826    E1: extract_even (1st vec, 2nd vec)
5827    E2: extract_odd (1st vec, 2nd vec)
5828    E3: extract_even (3rd vec, 4th vec)
5829    E4: extract_odd (3rd vec, 4th vec)
5830
5831    The output for the first stage will be:
5832
5833    E1:  0  2  4  6  8 10 12 14
5834    E2:  1  3  5  7  9 11 13 15
5835    E3: 16 18 20 22 24 26 28 30
5836    E4: 17 19 21 23 25 27 29 31
5837
5838    In order to proceed and create the correct sequence for the next stage (or
5839    for the correct output, if the second stage is the last one, as in our
5840    example), we first put the output of extract_even operation and then the
5841    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5842    The input for the second stage is:
5843
5844    1st vec (E1):  0  2  4  6  8 10 12 14
5845    2nd vec (E3): 16 18 20 22 24 26 28 30
5846    3rd vec (E2):  1  3  5  7  9 11 13 15
5847    4th vec (E4): 17 19 21 23 25 27 29 31
5848
5849    The output of the second stage:
5850
5851    E1: 0 4  8 12 16 20 24 28
5852    E2: 2 6 10 14 18 22 26 30
5853    E3: 1 5  9 13 17 21 25 29
5854    E4: 3 7 11 15 19 23 27 31
5855
5856    And RESULT_CHAIN after reordering:
5857
5858    1st vec (E1):  0 4  8 12 16 20 24 28
5859    2nd vec (E3):  1 5  9 13 17 21 25 29
5860    3rd vec (E2):  2 6 10 14 18 22 26 30
5861    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5862
5863 static void
5864 vect_permute_load_chain (vec<tree> dr_chain,
5865                          unsigned int length,
5866                          gimple *stmt,
5867                          gimple_stmt_iterator *gsi,
5868                          vec<tree> *result_chain)
5869 {
5870   tree data_ref, first_vect, second_vect;
5871   tree perm_mask_even, perm_mask_odd;
5872   tree perm3_mask_low, perm3_mask_high;
5873   gimple *perm_stmt;
5874   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5875   unsigned int i, j, log_length = exact_log2 (length);
5876
5877   result_chain->quick_grow (length);
5878   memcpy (result_chain->address (), dr_chain.address (),
5879           length * sizeof (tree));
5880
5881   if (length == 3)
5882     {
5883       /* vect_grouped_load_supported ensures that this is constant.  */
5884       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5885       unsigned int k;
5886
5887       vec_perm_builder sel (nelt, nelt, 1);
5888       sel.quick_grow (nelt);
5889       vec_perm_indices indices;
5890       for (k = 0; k < 3; k++)
5891         {
5892           for (i = 0; i < nelt; i++)
5893             if (3 * i + k < 2 * nelt)
5894               sel[i] = 3 * i + k;
5895             else
5896               sel[i] = 0;
5897           indices.new_vector (sel, 2, nelt);
5898           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5899
5900           for (i = 0, j = 0; i < nelt; i++)
5901             if (3 * i + k < 2 * nelt)
5902               sel[i] = i;
5903             else
5904               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5905           indices.new_vector (sel, 2, nelt);
5906           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5907
5908           first_vect = dr_chain[0];
5909           second_vect = dr_chain[1];
5910
5911           /* Create interleaving stmt (low part of):
5912              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5913                                                              ...}>  */
5914           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5915           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5916                                            second_vect, perm3_mask_low);
5917           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5918
5919           /* Create interleaving stmt (high part of):
5920              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5921                                                               ...}>  */
5922           first_vect = data_ref;
5923           second_vect = dr_chain[2];
5924           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5925           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5926                                            second_vect, perm3_mask_high);
5927           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5928           (*result_chain)[k] = data_ref;
5929         }
5930     }
5931   else
5932     {
5933       /* If length is not equal to 3 then only power of 2 is supported.  */
5934       gcc_assert (pow2p_hwi (length));
5935
5936       /* The encoding has a single stepped pattern.  */
5937       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5938       vec_perm_builder sel (nelt, 1, 3);
5939       sel.quick_grow (3);
5940       for (i = 0; i < 3; ++i)
5941         sel[i] = i * 2;
5942       vec_perm_indices indices (sel, 2, nelt);
5943       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5944
5945       for (i = 0; i < 3; ++i)
5946         sel[i] = i * 2 + 1;
5947       indices.new_vector (sel, 2, nelt);
5948       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5949
5950       for (i = 0; i < log_length; i++)
5951         {
5952           for (j = 0; j < length; j += 2)
5953             {
5954               first_vect = dr_chain[j];
5955               second_vect = dr_chain[j+1];
5956
5957               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5958               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5959               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5960                                                first_vect, second_vect,
5961                                                perm_mask_even);
5962               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5963               (*result_chain)[j/2] = data_ref;
5964
5965               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5966               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5967               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5968                                                first_vect, second_vect,
5969                                                perm_mask_odd);
5970               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5971               (*result_chain)[j/2+length/2] = data_ref;
5972             }
5973           memcpy (dr_chain.address (), result_chain->address (),
5974                   length * sizeof (tree));
5975         }
5976     }
5977 }
5978
5979 /* Function vect_shift_permute_load_chain.
5980
5981    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5982    sequence of stmts to reorder the input data accordingly.
5983    Return the final references for loads in RESULT_CHAIN.
5984    Return true if successed, false otherwise.
5985
5986    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5987    The input is 3 vectors each containing 8 elements.  We assign a
5988    number to each element, the input sequence is:
5989
5990    1st vec:   0  1  2  3  4  5  6  7
5991    2nd vec:   8  9 10 11 12 13 14 15
5992    3rd vec:  16 17 18 19 20 21 22 23
5993
5994    The output sequence should be:
5995
5996    1st vec:  0 3 6  9 12 15 18 21
5997    2nd vec:  1 4 7 10 13 16 19 22
5998    3rd vec:  2 5 8 11 14 17 20 23
5999
6000    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6001
6002    First we shuffle all 3 vectors to get correct elements order:
6003
6004    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
6005    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
6006    3rd vec:  (16 19 22) (17 20 23) (18 21)
6007
6008    Next we unite and shift vector 3 times:
6009
6010    1st step:
6011      shift right by 6 the concatenation of:
6012      "1st vec" and  "2nd vec"
6013        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6014      "2nd vec" and  "3rd vec"
6015        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6016      "3rd vec" and  "1st vec"
6017        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6018                              | New vectors                   |
6019
6020      So that now new vectors are:
6021
6022      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6023      2nd vec:  (10 13) (16 19 22) (17 20 23)
6024      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6025
6026    2nd step:
6027      shift right by 5 the concatenation of:
6028      "1st vec" and  "3rd vec"
6029        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6030      "2nd vec" and  "1st vec"
6031        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6032      "3rd vec" and  "2nd vec"
6033        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6034                           | New vectors                   |
6035
6036      So that now new vectors are:
6037
6038      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6039      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6040      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6041
6042    3rd step:
6043      shift right by 5 the concatenation of:
6044      "1st vec" and  "1st vec"
6045        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6046      shift right by 3 the concatenation of:
6047      "2nd vec" and  "2nd vec"
6048                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6049                           | New vectors                   |
6050
6051      So that now all vectors are READY:
6052      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6053      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6054      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6055
6056    This algorithm is faster than one in vect_permute_load_chain if:
6057      1.  "shift of a concatination" is faster than general permutation.
6058          This is usually so.
6059      2.  The TARGET machine can't execute vector instructions in parallel.
6060          This is because each step of the algorithm depends on previous.
6061          The algorithm in vect_permute_load_chain is much more parallel.
6062
6063    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6064 */
6065
6066 static bool
6067 vect_shift_permute_load_chain (vec<tree> dr_chain,
6068                                unsigned int length,
6069                                gimple *stmt,
6070                                gimple_stmt_iterator *gsi,
6071                                vec<tree> *result_chain)
6072 {
6073   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6074   tree perm2_mask1, perm2_mask2, perm3_mask;
6075   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6076   gimple *perm_stmt;
6077
6078   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6079   unsigned int i;
6080   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6081   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6082
6083   unsigned HOST_WIDE_INT nelt, vf;
6084   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6085       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6086     /* Not supported for variable-length vectors.  */
6087     return false;
6088
6089   vec_perm_builder sel (nelt, nelt, 1);
6090   sel.quick_grow (nelt);
6091
6092   result_chain->quick_grow (length);
6093   memcpy (result_chain->address (), dr_chain.address (),
6094           length * sizeof (tree));
6095
6096   if (pow2p_hwi (length) && vf > 4)
6097     {
6098       unsigned int j, log_length = exact_log2 (length);
6099       for (i = 0; i < nelt / 2; ++i)
6100         sel[i] = i * 2;
6101       for (i = 0; i < nelt / 2; ++i)
6102         sel[nelt / 2 + i] = i * 2 + 1;
6103       vec_perm_indices indices (sel, 2, nelt);
6104       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6105         {
6106           if (dump_enabled_p ())
6107             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6108                              "shuffle of 2 fields structure is not \
6109                               supported by target\n");
6110           return false;
6111         }
6112       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6113
6114       for (i = 0; i < nelt / 2; ++i)
6115         sel[i] = i * 2 + 1;
6116       for (i = 0; i < nelt / 2; ++i)
6117         sel[nelt / 2 + i] = i * 2;
6118       indices.new_vector (sel, 2, nelt);
6119       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6120         {
6121           if (dump_enabled_p ())
6122             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6123                              "shuffle of 2 fields structure is not \
6124                               supported by target\n");
6125           return false;
6126         }
6127       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6128
6129       /* Generating permutation constant to shift all elements.
6130          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6131       for (i = 0; i < nelt; i++)
6132         sel[i] = nelt / 2 + i;
6133       indices.new_vector (sel, 2, nelt);
6134       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6135         {
6136           if (dump_enabled_p ())
6137             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6138                              "shift permutation is not supported by target\n");
6139           return false;
6140         }
6141       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6142
6143       /* Generating permutation constant to select vector from 2.
6144          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6145       for (i = 0; i < nelt / 2; i++)
6146         sel[i] = i;
6147       for (i = nelt / 2; i < nelt; i++)
6148         sel[i] = nelt + i;
6149       indices.new_vector (sel, 2, nelt);
6150       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6151         {
6152           if (dump_enabled_p ())
6153             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6154                              "select is not supported by target\n");
6155           return false;
6156         }
6157       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6158
6159       for (i = 0; i < log_length; i++)
6160         {
6161           for (j = 0; j < length; j += 2)
6162             {
6163               first_vect = dr_chain[j];
6164               second_vect = dr_chain[j + 1];
6165
6166               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6167               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6168                                                first_vect, first_vect,
6169                                                perm2_mask1);
6170               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6171               vect[0] = data_ref;
6172
6173               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6174               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6175                                                second_vect, second_vect,
6176                                                perm2_mask2);
6177               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6178               vect[1] = data_ref;
6179
6180               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6181               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6182                                                vect[0], vect[1], shift1_mask);
6183               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6184               (*result_chain)[j/2 + length/2] = data_ref;
6185
6186               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6187               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6188                                                vect[0], vect[1], select_mask);
6189               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6190               (*result_chain)[j/2] = data_ref;
6191             }
6192           memcpy (dr_chain.address (), result_chain->address (),
6193                   length * sizeof (tree));
6194         }
6195       return true;
6196     }
6197   if (length == 3 && vf > 2)
6198     {
6199       unsigned int k = 0, l = 0;
6200
6201       /* Generating permutation constant to get all elements in rigth order.
6202          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6203       for (i = 0; i < nelt; i++)
6204         {
6205           if (3 * k + (l % 3) >= nelt)
6206             {
6207               k = 0;
6208               l += (3 - (nelt % 3));
6209             }
6210           sel[i] = 3 * k + (l % 3);
6211           k++;
6212         }
6213       vec_perm_indices indices (sel, 2, nelt);
6214       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6215         {
6216           if (dump_enabled_p ())
6217             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6218                              "shuffle of 3 fields structure is not \
6219                               supported by target\n");
6220           return false;
6221         }
6222       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6223
6224       /* Generating permutation constant to shift all elements.
6225          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6226       for (i = 0; i < nelt; i++)
6227         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6228       indices.new_vector (sel, 2, nelt);
6229       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6230         {
6231           if (dump_enabled_p ())
6232             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6233                              "shift permutation is not supported by target\n");
6234           return false;
6235         }
6236       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6237
6238       /* Generating permutation constant to shift all elements.
6239          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6240       for (i = 0; i < nelt; i++)
6241         sel[i] = 2 * (nelt / 3) + 1 + i;
6242       indices.new_vector (sel, 2, nelt);
6243       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6244         {
6245           if (dump_enabled_p ())
6246             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6247                              "shift permutation is not supported by target\n");
6248           return false;
6249         }
6250       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6251
6252       /* Generating permutation constant to shift all elements.
6253          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6254       for (i = 0; i < nelt; i++)
6255         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6256       indices.new_vector (sel, 2, nelt);
6257       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6258         {
6259           if (dump_enabled_p ())
6260             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6261                              "shift permutation is not supported by target\n");
6262           return false;
6263         }
6264       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6265
6266       /* Generating permutation constant to shift all elements.
6267          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6268       for (i = 0; i < nelt; i++)
6269         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6270       indices.new_vector (sel, 2, nelt);
6271       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6272         {
6273           if (dump_enabled_p ())
6274             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6275                              "shift permutation is not supported by target\n");
6276           return false;
6277         }
6278       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6279
6280       for (k = 0; k < 3; k++)
6281         {
6282           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6283           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6284                                            dr_chain[k], dr_chain[k],
6285                                            perm3_mask);
6286           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6287           vect[k] = data_ref;
6288         }
6289
6290       for (k = 0; k < 3; k++)
6291         {
6292           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6293           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6294                                            vect[k % 3], vect[(k + 1) % 3],
6295                                            shift1_mask);
6296           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6297           vect_shift[k] = data_ref;
6298         }
6299
6300       for (k = 0; k < 3; k++)
6301         {
6302           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6303           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6304                                            vect_shift[(4 - k) % 3],
6305                                            vect_shift[(3 - k) % 3],
6306                                            shift2_mask);
6307           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6308           vect[k] = data_ref;
6309         }
6310
6311       (*result_chain)[3 - (nelt % 3)] = vect[2];
6312
6313       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6314       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6315                                        vect[0], shift3_mask);
6316       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6317       (*result_chain)[nelt % 3] = data_ref;
6318
6319       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6320       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6321                                        vect[1], shift4_mask);
6322       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6323       (*result_chain)[0] = data_ref;
6324       return true;
6325     }
6326   return false;
6327 }
6328
6329 /* Function vect_transform_grouped_load.
6330
6331    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6332    to perform their permutation and ascribe the result vectorized statements to
6333    the scalar statements.
6334 */
6335
6336 void
6337 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6338                              gimple_stmt_iterator *gsi)
6339 {
6340   machine_mode mode;
6341   vec<tree> result_chain = vNULL;
6342
6343   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6344      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6345      vectors, that are ready for vector computation.  */
6346   result_chain.create (size);
6347
6348   /* If reassociation width for vector type is 2 or greater target machine can
6349      execute 2 or more vector instructions in parallel.  Otherwise try to
6350      get chain for loads group using vect_shift_permute_load_chain.  */
6351   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6352   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6353       || pow2p_hwi (size)
6354       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6355                                          gsi, &result_chain))
6356     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6357   vect_record_grouped_load_vectors (stmt, result_chain);
6358   result_chain.release ();
6359 }
6360
6361 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6362    generated as part of the vectorization of STMT.  Assign the statement
6363    for each vector to the associated scalar statement.  */
6364
6365 void
6366 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6367 {
6368   gimple *first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6369   gimple *next_stmt, *new_stmt;
6370   unsigned int i, gap_count;
6371   tree tmp_data_ref;
6372
6373   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6374      Since we scan the chain starting from it's first node, their order
6375      corresponds the order of data-refs in RESULT_CHAIN.  */
6376   next_stmt = first_stmt;
6377   gap_count = 1;
6378   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6379     {
6380       if (!next_stmt)
6381         break;
6382
6383       /* Skip the gaps.  Loads created for the gaps will be removed by dead
6384        code elimination pass later.  No need to check for the first stmt in
6385        the group, since it always exists.
6386        DR_GROUP_GAP is the number of steps in elements from the previous
6387        access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
6388        correspond to the gaps.  */
6389       if (next_stmt != first_stmt
6390           && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
6391       {
6392         gap_count++;
6393         continue;
6394       }
6395
6396       while (next_stmt)
6397         {
6398           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6399           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6400              copies, and we put the new vector statement in the first available
6401              RELATED_STMT.  */
6402           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6403             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6404           else
6405             {
6406               if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6407                 {
6408                   gimple *prev_stmt =
6409                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6410                   gimple *rel_stmt =
6411                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6412                   while (rel_stmt)
6413                     {
6414                       prev_stmt = rel_stmt;
6415                       rel_stmt =
6416                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6417                     }
6418
6419                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6420                     new_stmt;
6421                 }
6422             }
6423
6424           next_stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6425           gap_count = 1;
6426           /* If NEXT_STMT accesses the same DR as the previous statement,
6427              put the same TMP_DATA_REF as its vectorized statement; otherwise
6428              get the next data-ref from RESULT_CHAIN.  */
6429           if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6430             break;
6431         }
6432     }
6433 }
6434
6435 /* Function vect_force_dr_alignment_p.
6436
6437    Returns whether the alignment of a DECL can be forced to be aligned
6438    on ALIGNMENT bit boundary.  */
6439
6440 bool
6441 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6442 {
6443   if (!VAR_P (decl))
6444     return false;
6445
6446   if (decl_in_symtab_p (decl)
6447       && !symtab_node::get (decl)->can_increase_alignment_p ())
6448     return false;
6449
6450   if (TREE_STATIC (decl))
6451     return (alignment <= MAX_OFILE_ALIGNMENT);
6452   else
6453     return (alignment <= MAX_STACK_ALIGNMENT);
6454 }
6455
6456
6457 /* Return whether the data reference DR is supported with respect to its
6458    alignment.
6459    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6460    it is aligned, i.e., check if it is possible to vectorize it with different
6461    alignment.  */
6462
6463 enum dr_alignment_support
6464 vect_supportable_dr_alignment (struct data_reference *dr,
6465                                bool check_aligned_accesses)
6466 {
6467   gimple *stmt = DR_STMT (dr);
6468   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6469   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6470   machine_mode mode = TYPE_MODE (vectype);
6471   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6472   struct loop *vect_loop = NULL;
6473   bool nested_in_vect_loop = false;
6474
6475   if (aligned_access_p (dr) && !check_aligned_accesses)
6476     return dr_aligned;
6477
6478   /* For now assume all conditional loads/stores support unaligned
6479      access without any special code.  */
6480   if (is_gimple_call (stmt)
6481       && gimple_call_internal_p (stmt)
6482       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6483           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6484     return dr_unaligned_supported;
6485
6486   if (loop_vinfo)
6487     {
6488       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6489       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6490     }
6491
6492   /* Possibly unaligned access.  */
6493
6494   /* We can choose between using the implicit realignment scheme (generating
6495      a misaligned_move stmt) and the explicit realignment scheme (generating
6496      aligned loads with a REALIGN_LOAD).  There are two variants to the
6497      explicit realignment scheme: optimized, and unoptimized.
6498      We can optimize the realignment only if the step between consecutive
6499      vector loads is equal to the vector size.  Since the vector memory
6500      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6501      is guaranteed that the misalignment amount remains the same throughout the
6502      execution of the vectorized loop.  Therefore, we can create the
6503      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6504      at the loop preheader.
6505
6506      However, in the case of outer-loop vectorization, when vectorizing a
6507      memory access in the inner-loop nested within the LOOP that is now being
6508      vectorized, while it is guaranteed that the misalignment of the
6509      vectorized memory access will remain the same in different outer-loop
6510      iterations, it is *not* guaranteed that is will remain the same throughout
6511      the execution of the inner-loop.  This is because the inner-loop advances
6512      with the original scalar step (and not in steps of VS).  If the inner-loop
6513      step happens to be a multiple of VS, then the misalignment remains fixed
6514      and we can use the optimized realignment scheme.  For example:
6515
6516       for (i=0; i<N; i++)
6517         for (j=0; j<M; j++)
6518           s += a[i+j];
6519
6520      When vectorizing the i-loop in the above example, the step between
6521      consecutive vector loads is 1, and so the misalignment does not remain
6522      fixed across the execution of the inner-loop, and the realignment cannot
6523      be optimized (as illustrated in the following pseudo vectorized loop):
6524
6525       for (i=0; i<N; i+=4)
6526         for (j=0; j<M; j++){
6527           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6528                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6529                          // (assuming that we start from an aligned address).
6530           }
6531
6532      We therefore have to use the unoptimized realignment scheme:
6533
6534       for (i=0; i<N; i+=4)
6535           for (j=k; j<M; j+=4)
6536           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6537                            // that the misalignment of the initial address is
6538                            // 0).
6539
6540      The loop can then be vectorized as follows:
6541
6542       for (k=0; k<4; k++){
6543         rt = get_realignment_token (&vp[k]);
6544         for (i=0; i<N; i+=4){
6545           v1 = vp[i+k];
6546           for (j=k; j<M; j+=4){
6547             v2 = vp[i+j+VS-1];
6548             va = REALIGN_LOAD <v1,v2,rt>;
6549             vs += va;
6550             v1 = v2;
6551           }
6552         }
6553     } */
6554
6555   if (DR_IS_READ (dr))
6556     {
6557       bool is_packed = false;
6558       tree type = (TREE_TYPE (DR_REF (dr)));
6559
6560       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6561           && (!targetm.vectorize.builtin_mask_for_load
6562               || targetm.vectorize.builtin_mask_for_load ()))
6563         {
6564           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6565
6566           /* If we are doing SLP then the accesses need not have the
6567              same alignment, instead it depends on the SLP group size.  */
6568           if (loop_vinfo
6569               && STMT_SLP_TYPE (stmt_info)
6570               && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6571                               * DR_GROUP_SIZE (vinfo_for_stmt
6572                                             (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6573                               TYPE_VECTOR_SUBPARTS (vectype)))
6574             ;
6575           else if (!loop_vinfo
6576                    || (nested_in_vect_loop
6577                        && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6578                                     GET_MODE_SIZE (TYPE_MODE (vectype)))))
6579             return dr_explicit_realign;
6580           else
6581             return dr_explicit_realign_optimized;
6582         }
6583       if (!known_alignment_for_access_p (dr))
6584         is_packed = not_size_aligned (DR_REF (dr));
6585
6586       if (targetm.vectorize.support_vector_misalignment
6587             (mode, type, DR_MISALIGNMENT (dr), is_packed))
6588         /* Can't software pipeline the loads, but can at least do them.  */
6589         return dr_unaligned_supported;
6590     }
6591   else
6592     {
6593       bool is_packed = false;
6594       tree type = (TREE_TYPE (DR_REF (dr)));
6595
6596       if (!known_alignment_for_access_p (dr))
6597         is_packed = not_size_aligned (DR_REF (dr));
6598
6599      if (targetm.vectorize.support_vector_misalignment
6600            (mode, type, DR_MISALIGNMENT (dr), is_packed))
6601        return dr_unaligned_supported;
6602     }
6603
6604   /* Unsupported.  */
6605   return dr_unaligned_unsupported;
6606 }