[33/46] Use stmt_vec_infos instead of vec_info/gimple stmt pairs
[official-gcc.git] / gcc / tree-vect-data-refs.c
blob f3cf4404bbd75c332833124f5d2dcbe0c25c9719
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
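/* As a rough illustration (hypothetical types and target): asking this
   function about vec_load_lanes_optab with a V4SI VECTYPE and COUNT == 3
   amounts to checking whether the target provides an array mode holding
   three V4SI vectors (via TARGET_ARRAY_MODE or a 384-bit integer mode)
   and whether a vec_load_lanes pattern exists for that pair of modes, as
   targets with load-multiple instructions such as AArch64's LD3 do.  */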
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
102 /* Return the smallest scalar part of STMT_INFO.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation, we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
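/* For example (illustrative only), for the widening statement

     int_x = (int) short_y;

   the lhs type has size 4 while the rhs type has size 2, so this
   function returns 'short int' and sets *LHS_SIZE_UNIT to 4 and
   *RHS_SIZE_UNIT to 2.  */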
119 tree
120 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
121 HOST_WIDE_INT *lhs_size_unit,
122 HOST_WIDE_INT *rhs_size_unit)
124 tree scalar_type = gimple_expr_type (stmt_info->stmt);
125 HOST_WIDE_INT lhs, rhs;
127 /* During the analysis phase, this function is called on arbitrary
128 statements that might not have scalar results. */
129 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
130 return scalar_type;
132 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
134 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
135 if (assign
136 && (gimple_assign_cast_p (assign)
137 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
140 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
141 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
143 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
145 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
146 if (rhs < lhs)
147 scalar_type = rhs_type;
150 *lhs_size_unit = lhs;
151 *rhs_size_unit = rhs;
152 return scalar_type;
156 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
157 tested at run-time. Return TRUE if DDR was successfully inserted.
158 Return false if versioning is not supported. */
160 static bool
161 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
163 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
165 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
166 return false;
168 if (!runtime_alias_check_p (ddr, loop,
169 optimize_loop_nest_for_speed_p (loop)))
170 return false;
172 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
173 return true;
176 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
178 static void
179 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
181 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
182 for (unsigned int i = 0; i < checks.length(); ++i)
183 if (checks[i] == value)
184 return;
186 if (dump_enabled_p ())
188 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
189 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
190 dump_printf (MSG_NOTE, " is nonzero\n");
192 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
195 /* Return true if we know that the order of vectorized STMTINFO_A and
196 vectorized STMTINFO_B will be the same as the order of STMTINFO_A and
197 STMTINFO_B. At least one of the statements is a write. */
199 static bool
200 vect_preserves_scalar_order_p (stmt_vec_info stmtinfo_a,
201 stmt_vec_info stmtinfo_b)
203 /* Single statements are always kept in their original order. */
204 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
205 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
206 return true;
208 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
209 group are emitted at the position of the first scalar load and all
210 stores in a group are emitted at the position of the last scalar store.
211 Thus writes will happen no earlier than their current position
212 (but could happen later) while reads will happen no later than their
213 current position (but could happen earlier). Reordering is therefore
214 only possible if the first access is a write. */
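/* For example (hypothetical scalar sequence): in

     .. = a[i];   # load, member of a load group
     a[i] = ..;   # store, member of a store group

   the group load is emitted at the first scalar load and the group
   store is sunk to the last scalar store, so the read still happens
   before the write.  If the write came first instead, sinking it past
   the read could swap the two accesses.  */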
215 if (is_pattern_stmt_p (stmtinfo_a))
216 stmtinfo_a = STMT_VINFO_RELATED_STMT (stmtinfo_a);
217 if (is_pattern_stmt_p (stmtinfo_b))
218 stmtinfo_b = STMT_VINFO_RELATED_STMT (stmtinfo_b);
219 stmt_vec_info earlier_stmt_info = get_earlier_stmt (stmtinfo_a, stmtinfo_b);
220 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (earlier_stmt_info));
223 /* A subroutine of vect_analyze_data_ref_dependence. Handle
224 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
225 distances. These distances are conservatively correct but they don't
226 reflect a guaranteed dependence.
228 Return true if this function does all the work necessary to avoid
229 an alias or false if the caller should use the dependence distances
230 to limit the vectorization factor in the usual way. LOOP_DEPTH is
231 the depth of the loop described by LOOP_VINFO and the other arguments
232 are as for vect_analyze_data_ref_dependence. */
234 static bool
235 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
236 loop_vec_info loop_vinfo,
237 int loop_depth, unsigned int *max_vf)
239 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
240 lambda_vector dist_v;
241 unsigned int i;
242 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
244 int dist = dist_v[loop_depth];
245 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
247 /* If the user asserted safelen >= DIST consecutive iterations
248 can be executed concurrently, assume independence.
250 ??? An alternative would be to add the alias check even
251 in this case, and vectorize the fallback loop with the
252 maximum VF set to safelen. However, if the user has
253 explicitly given a length, it's less likely that that
254 would be a win. */
255 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
257 if ((unsigned int) loop->safelen < *max_vf)
258 *max_vf = loop->safelen;
259 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
260 continue;
263 /* For dependence distances of 2 or more, we have the option
264 of limiting VF or checking for an alias at runtime.
265 Prefer to check at runtime if we can, to avoid limiting
266 the VF unnecessarily when the bases are in fact independent.
268 Note that the alias checks will be removed if the VF ends up
269 being small enough. */
270 return (!STMT_VINFO_GATHER_SCATTER_P
271 (vinfo_for_stmt (DR_STMT (DDR_A (ddr))))
272 && !STMT_VINFO_GATHER_SCATTER_P
273 (vinfo_for_stmt (DR_STMT (DDR_B (ddr))))
274 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
277 return true;
281 /* Function vect_analyze_data_ref_dependence.
283 Return TRUE if there (might) exist a dependence between a memory-reference
284 DRA and a memory-reference DRB. When versioning for alias can check the
285 dependence at run-time instead, return FALSE. Adjust *MAX_VF according to
286 the data dependence. */
288 static bool
289 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
290 loop_vec_info loop_vinfo,
291 unsigned int *max_vf)
293 unsigned int i;
294 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
295 struct data_reference *dra = DDR_A (ddr);
296 struct data_reference *drb = DDR_B (ddr);
297 stmt_vec_info stmtinfo_a = vect_dr_stmt (dra);
298 stmt_vec_info stmtinfo_b = vect_dr_stmt (drb);
299 lambda_vector dist_v;
300 unsigned int loop_depth;
302 /* In loop analysis all data references should be vectorizable. */
303 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
304 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
305 gcc_unreachable ();
307 /* Independent data accesses. */
308 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
309 return false;
311 if (dra == drb
312 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
313 return false;
315 /* We do not have to consider dependences between accesses that belong
316 to the same group, unless the stride could be smaller than the
317 group size. */
318 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
319 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
320 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
321 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
322 return false;
324 /* Even if we have an anti-dependence then, as the vectorized loop covers at
325 least two scalar iterations, there is always also a true dependence.
326 As the vectorizer does not re-order loads and stores we can ignore
327 the anti-dependence if TBAA can disambiguate both DRs similar to the
328 case with known negative distance anti-dependences (positive
329 distance anti-dependences would violate TBAA constraints). */
330 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
331 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
332 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
333 get_alias_set (DR_REF (drb))))
334 return false;
336 /* Unknown data dependence. */
337 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
339 /* If user asserted safelen consecutive iterations can be
340 executed concurrently, assume independence. */
341 if (loop->safelen >= 2)
343 if ((unsigned int) loop->safelen < *max_vf)
344 *max_vf = loop->safelen;
345 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
346 return false;
349 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
350 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
352 if (dump_enabled_p ())
354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
355 "versioning for alias not supported for: "
356 "can't determine dependence between ");
357 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
358 DR_REF (dra));
359 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
360 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
361 DR_REF (drb));
362 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
364 return true;
367 if (dump_enabled_p ())
369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
370 "versioning for alias required: "
371 "can't determine dependence between ");
372 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
373 DR_REF (dra));
374 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
375 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
376 DR_REF (drb));
377 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
380 /* Add to list of ddrs that need to be tested at run-time. */
381 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
384 /* Known data dependence. */
385 if (DDR_NUM_DIST_VECTS (ddr) == 0)
387 /* If user asserted safelen consecutive iterations can be
388 executed concurrently, assume independence. */
389 if (loop->safelen >= 2)
391 if ((unsigned int) loop->safelen < *max_vf)
392 *max_vf = loop->safelen;
393 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
394 return false;
397 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
398 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
400 if (dump_enabled_p ())
402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
403 "versioning for alias not supported for: "
404 "bad dist vector for ");
405 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
406 DR_REF (dra));
407 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
408 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
409 DR_REF (drb));
410 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
412 return true;
415 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "versioning for alias required: "
419 "bad dist vector for ");
420 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
421 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
422 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
423 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
425 /* Add to list of ddrs that need to be tested at run-time. */
426 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
429 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
431 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
432 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
433 loop_depth, max_vf))
434 return false;
436 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
438 int dist = dist_v[loop_depth];
440 if (dump_enabled_p ())
441 dump_printf_loc (MSG_NOTE, vect_location,
442 "dependence distance = %d.\n", dist);
444 if (dist == 0)
446 if (dump_enabled_p ())
448 dump_printf_loc (MSG_NOTE, vect_location,
449 "dependence distance == 0 between ");
450 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
451 dump_printf (MSG_NOTE, " and ");
452 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
453 dump_printf (MSG_NOTE, "\n");
456 /* When we perform grouped accesses and perform implicit CSE
457 by detecting equal accesses and doing disambiguation with
458 runtime alias tests like for
459 .. = a[i];
460 .. = a[i+1];
461 a[i] = ..;
462 a[i+1] = ..;
463 *p = ..;
464 .. = a[i];
465 .. = a[i+1];
466 where we will end up loading { a[i], a[i+1] } once, make
467 sure that inserting group loads before the first load and
468 stores after the last store will do the right thing.
469 Similar for groups like
470 a[i] = ...;
471 ... = a[i];
472 a[i+1] = ...;
473 where loads from the group interleave with the store. */
474 if (!vect_preserves_scalar_order_p (stmtinfo_a, stmtinfo_b))
476 if (dump_enabled_p ())
477 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
478 "READ_WRITE dependence in interleaving.\n");
479 return true;
482 if (loop->safelen < 2)
484 tree indicator = dr_zero_step_indicator (dra);
485 if (!indicator || integer_zerop (indicator))
487 if (dump_enabled_p ())
488 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
489 "access also has a zero step\n");
490 return true;
492 else if (TREE_CODE (indicator) != INTEGER_CST)
493 vect_check_nonzero_value (loop_vinfo, indicator);
495 continue;
498 if (dist > 0 && DDR_REVERSED_P (ddr))
500 /* If DDR_REVERSED_P the order of the data-refs in DDR was
501 reversed (to make distance vector positive), and the actual
502 distance is negative. */
503 if (dump_enabled_p ())
504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
505 "dependence distance negative.\n");
506 /* Record a negative dependence distance to later limit the
507 amount of stmt copying / unrolling we can perform.
508 Only need to handle read-after-write dependence. */
509 if (DR_IS_READ (drb)
510 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
511 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
512 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
513 continue;
516 unsigned int abs_dist = abs (dist);
517 if (abs_dist >= 2 && abs_dist < *max_vf)
519 /* The dependence distance requires reduction of the maximal
520 vectorization factor. */
521 *max_vf = abs (dist);
522 if (dump_enabled_p ())
523 dump_printf_loc (MSG_NOTE, vect_location,
524 "adjusting maximal vectorization factor to %i\n",
525 *max_vf);
528 if (abs_dist >= *max_vf)
530 /* Dependence distance does not create dependence, as far as
531 vectorization is concerned, in this case. */
532 if (dump_enabled_p ())
533 dump_printf_loc (MSG_NOTE, vect_location,
534 "dependence distance >= VF.\n");
535 continue;
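/* To illustrate the two distance cases above (hypothetical example):
   if the only dependence in the loop is a[i + 3] = a[i] + 1, the
   distance is 3, so a *MAX_VF of 8 is reduced to 3, whereas a *MAX_VF
   of 2 is already smaller than the distance and the dependence can be
   ignored for vectorization.  */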
538 if (dump_enabled_p ())
540 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
541 "not vectorized, possible dependence "
542 "between data-refs ");
543 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
544 dump_printf (MSG_NOTE, " and ");
545 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
546 dump_printf (MSG_NOTE, "\n");
549 return true;
552 return false;
555 /* Function vect_analyze_data_ref_dependences.
557 Examine all the data references in the loop, and make sure there do not
558 exist any data dependences between them. Set *MAX_VF according to
559 the maximum vectorization factor the data dependences allow. */
561 bool
562 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
563 unsigned int *max_vf)
565 unsigned int i;
566 struct data_dependence_relation *ddr;
568 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
570 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
572 LOOP_VINFO_DDRS (loop_vinfo)
573 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
574 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
575 /* We need read-read dependences to compute
576 STMT_VINFO_SAME_ALIGN_REFS. */
577 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
578 &LOOP_VINFO_DDRS (loop_vinfo),
579 LOOP_VINFO_LOOP_NEST (loop_vinfo),
580 true);
581 gcc_assert (res);
584 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
586 /* For epilogues we either have no aliases or alias versioning
587 was applied to the original loop. Therefore we may just get max_vf
588 using the VF of the original loop. */
589 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
590 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
591 else
592 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
593 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
594 return false;
596 return true;
600 /* Function vect_slp_analyze_data_ref_dependence.
602 Return TRUE if there (might) exist a dependence between a memory-reference
603 DRA and a memory-reference DRB. Unlike the loop variant above, there is
604 no *MAX_VF to adjust and no run-time alias check to fall back on, so any
605 dependence that cannot be ruled out prevents vectorizing the SLP instance. */
607 static bool
608 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
610 struct data_reference *dra = DDR_A (ddr);
611 struct data_reference *drb = DDR_B (ddr);
613 /* We need to check dependences of statements marked as unvectorizable
614 as well; they can still prohibit vectorization.
616 /* Independent data accesses. */
617 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
618 return false;
620 if (dra == drb)
621 return false;
623 /* Read-read is OK. */
624 if (DR_IS_READ (dra) && DR_IS_READ (drb))
625 return false;
627 /* If dra and drb are part of the same interleaving chain consider
628 them independent. */
629 if (STMT_VINFO_GROUPED_ACCESS (vect_dr_stmt (dra))
630 && (DR_GROUP_FIRST_ELEMENT (vect_dr_stmt (dra))
631 == DR_GROUP_FIRST_ELEMENT (vect_dr_stmt (drb))))
632 return false;
634 /* Unknown data dependence. */
635 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
637 if (dump_enabled_p ())
639 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
640 "can't determine dependence between ");
641 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
642 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
643 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
644 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
647 else if (dump_enabled_p ())
649 dump_printf_loc (MSG_NOTE, vect_location,
650 "determined dependence between ");
651 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
652 dump_printf (MSG_NOTE, " and ");
653 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
654 dump_printf (MSG_NOTE, "\n");
657 return true;
661 /* Analyze dependences involved in the transform of SLP NODE. STORES
662 contain the vector of scalar stores of this instance if we are
663 disambiguating the loads. */
665 static bool
666 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
667 vec<stmt_vec_info> stores,
668 stmt_vec_info last_store_info)
670 /* This walks over all stmts involved in the SLP load/store done
671 in NODE verifying we can sink them up to the last stmt in the
672 group. */
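/* E.g. (hypothetical scalar code) for an SLP store group

     a[0] = x;
     tem = *p;
     a[1] = y;

   the store a[0] = x has to be sunk past tem = *p down to the last
   store of the group, so any possible aliasing between *p and a[]
   must be ruled out here.  */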
673 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
674 vec_info *vinfo = last_access_info->vinfo;
675 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
677 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
678 if (access_info == last_access_info)
679 continue;
680 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
681 ao_ref ref;
682 bool ref_initialized_p = false;
683 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
684 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
686 gimple *stmt = gsi_stmt (gsi);
687 if (! gimple_vuse (stmt)
688 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
689 continue;
691 /* If we couldn't record a (single) data reference for this
692 stmt we have to resort to the alias oracle. */
693 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
694 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
695 if (!dr_b)
697 /* We are moving a store or sinking a load - this means
698 we cannot use TBAA for disambiguation. */
699 if (!ref_initialized_p)
700 ao_ref_init (&ref, DR_REF (dr_a));
701 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
702 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
703 return false;
704 continue;
707 bool dependent = false;
708 /* If we run into a store of this same instance (we've just
709 marked those) then delay dependence checking until we run
710 into the last store because this is where it will have
711 been sunk to (and we verify if we can do that as well). */
712 if (gimple_visited_p (stmt))
714 if (stmt_info != last_store_info)
715 continue;
716 unsigned i;
717 stmt_vec_info store_info;
718 FOR_EACH_VEC_ELT (stores, i, store_info)
720 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
721 ddr_p ddr = initialize_data_dependence_relation
722 (dr_a, store_dr, vNULL);
723 dependent = vect_slp_analyze_data_ref_dependence (ddr);
724 free_dependence_relation (ddr);
725 if (dependent)
726 break;
729 else
731 ddr_p ddr = initialize_data_dependence_relation (dr_a,
732 dr_b, vNULL);
733 dependent = vect_slp_analyze_data_ref_dependence (ddr);
734 free_dependence_relation (ddr);
736 if (dependent)
737 return false;
740 return true;
744 /* Function vect_slp_analyze_instance_dependence.
746 Examine all the data references in the SLP instance, and make sure there
747 do not exist any data dependences between them that would prevent the
748 instance from being vectorized. */
750 bool
751 vect_slp_analyze_instance_dependence (slp_instance instance)
753 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
755 /* The stores of this instance are at the root of the SLP tree. */
756 slp_tree store = SLP_INSTANCE_TREE (instance);
757 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
758 store = NULL;
760 /* Verify we can sink stores to the vectorized stmt insert location. */
761 stmt_vec_info last_store_info = NULL;
762 if (store)
764 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
765 return false;
767 /* Mark stores in this instance and remember the last one. */
768 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
769 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
770 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
773 bool res = true;
775 /* Verify we can sink loads to the vectorized stmt insert location,
776 special-casing stores of this instance. */
777 slp_tree load;
778 unsigned int i;
779 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
780 if (! vect_slp_analyze_node_dependences (instance, load,
781 store
782 ? SLP_TREE_SCALAR_STMTS (store)
783 : vNULL, last_store_info))
785 res = false;
786 break;
789 /* Unset the visited flag. */
790 if (store)
791 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
792 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
794 return res;
797 /* Record the base alignment guarantee given by DRB, which occurs
798 in STMT_INFO. */
800 static void
801 vect_record_base_alignment (stmt_vec_info stmt_info,
802 innermost_loop_behavior *drb)
804 vec_info *vinfo = stmt_info->vinfo;
805 bool existed;
806 innermost_loop_behavior *&entry
807 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
808 if (!existed || entry->base_alignment < drb->base_alignment)
810 entry = drb;
811 if (dump_enabled_p ())
813 dump_printf_loc (MSG_NOTE, vect_location,
814 "recording new base alignment for ");
815 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
816 dump_printf (MSG_NOTE, "\n");
817 dump_printf_loc (MSG_NOTE, vect_location,
818 " alignment: %d\n", drb->base_alignment);
819 dump_printf_loc (MSG_NOTE, vect_location,
820 " misalignment: %d\n", drb->base_misalignment);
821 dump_printf_loc (MSG_NOTE, vect_location,
822 " based on: ");
823 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
828 /* If the region we're going to vectorize is reached, all unconditional
829 data references occur at least once. We can therefore pool the base
830 alignment guarantees from each unconditional reference. Do this by
831 going through all the data references in VINFO and checking whether
832 the containing statement makes the reference unconditionally. If so,
833 record the alignment of the base address in VINFO so that it can be
834 used for all other references with the same base. */
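/* For instance (illustrative only), in

     for (i = 0; i < n; i++)
       {
         x[i] = 0;         # unconditional: base alignment of X recorded
         if (c[i])
           x[i] += y[i];   # conditional: no guarantee recorded for it
       }

   the alignment derived from the unconditional store is recorded once
   and can then be reused by every other reference sharing X's base
   address.  */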
836 void
837 vect_record_base_alignments (vec_info *vinfo)
839 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
840 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
841 data_reference *dr;
842 unsigned int i;
843 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
845 stmt_vec_info stmt_info = vect_dr_stmt (dr);
846 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
847 && STMT_VINFO_VECTORIZABLE (stmt_info)
848 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
850 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
852 /* If DR is nested in the loop that is being vectorized, we can also
853 record the alignment of the base wrt the outer loop. */
854 if (loop && nested_in_vect_loop_p (loop, stmt_info))
855 vect_record_base_alignment
856 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
861 /* Return the target alignment for the vectorized form of DR. */
863 static unsigned int
864 vect_calculate_target_alignment (struct data_reference *dr)
866 stmt_vec_info stmt_info = vect_dr_stmt (dr);
867 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
868 return targetm.vectorize.preferred_vector_alignment (vectype);
871 /* Function vect_compute_data_ref_alignment
873 Compute the misalignment of the data reference DR.
875 Output:
876 1. DR_MISALIGNMENT (DR) is defined.
878 FOR NOW: No analysis is actually performed. Misalignment is calculated
879 only for trivial cases. TODO. */
881 static void
882 vect_compute_data_ref_alignment (struct data_reference *dr)
884 stmt_vec_info stmt_info = vect_dr_stmt (dr);
885 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
886 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
887 struct loop *loop = NULL;
888 tree ref = DR_REF (dr);
889 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
891 if (dump_enabled_p ())
892 dump_printf_loc (MSG_NOTE, vect_location,
893 "vect_compute_data_ref_alignment:\n");
895 if (loop_vinfo)
896 loop = LOOP_VINFO_LOOP (loop_vinfo);
898 /* Initialize misalignment to unknown. */
899 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
901 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
902 return;
904 innermost_loop_behavior *drb = vect_dr_behavior (dr);
905 bool step_preserves_misalignment_p;
907 unsigned HOST_WIDE_INT vector_alignment
908 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
909 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
911 /* No step for BB vectorization. */
912 if (!loop)
914 gcc_assert (integer_zerop (drb->step));
915 step_preserves_misalignment_p = true;
918 /* In case the dataref is in an inner-loop of the loop that is being
919 vectorized (LOOP), we use the base and misalignment information
920 relative to the outer-loop (LOOP). This is ok only if the misalignment
921 stays the same throughout the execution of the inner-loop, which is why
922 we have to check that the stride of the dataref in the inner-loop is a
923 multiple of the vector alignment. */
924 else if (nested_in_vect_loop_p (loop, stmt_info))
926 step_preserves_misalignment_p
927 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
929 if (dump_enabled_p ())
931 if (step_preserves_misalignment_p)
932 dump_printf_loc (MSG_NOTE, vect_location,
933 "inner step divides the vector alignment.\n");
934 else
935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
936 "inner step doesn't divide the vector"
937 " alignment.\n");
941 /* Similarly we can only use base and misalignment information relative to
942 an innermost loop if the misalignment stays the same throughout the
943 execution of the loop. As above, this is the case if the stride of the
944 dataref times the vectorization factor is a multiple of the alignment. */
945 else
947 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
948 step_preserves_misalignment_p
949 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
951 if (!step_preserves_misalignment_p && dump_enabled_p ())
952 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
953 "step doesn't divide the vector alignment.\n");
956 unsigned int base_alignment = drb->base_alignment;
957 unsigned int base_misalignment = drb->base_misalignment;
959 /* Calculate the maximum of the pooled base address alignment and the
960 alignment that we can compute for DR itself. */
961 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
962 if (entry && base_alignment < (*entry)->base_alignment)
964 base_alignment = (*entry)->base_alignment;
965 base_misalignment = (*entry)->base_misalignment;
968 if (drb->offset_alignment < vector_alignment
969 || !step_preserves_misalignment_p
970 /* We need to know whether the step wrt the vectorized loop is
971 negative when computing the starting misalignment below. */
972 || TREE_CODE (drb->step) != INTEGER_CST)
974 if (dump_enabled_p ())
976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
977 "Unknown alignment for access: ");
978 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
979 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
981 return;
984 if (base_alignment < vector_alignment)
986 unsigned int max_alignment;
987 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
988 if (max_alignment < vector_alignment
989 || !vect_can_force_dr_alignment_p (base,
990 vector_alignment * BITS_PER_UNIT))
992 if (dump_enabled_p ())
994 dump_printf_loc (MSG_NOTE, vect_location,
995 "can't force alignment of ref: ");
996 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
997 dump_printf (MSG_NOTE, "\n");
999 return;
1002 /* Force the alignment of the decl.
1003 NOTE: This is the only change to the code we make during
1004 the analysis phase, before deciding to vectorize the loop. */
1005 if (dump_enabled_p ())
1007 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
1008 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1009 dump_printf (MSG_NOTE, "\n");
1012 DR_VECT_AUX (dr)->base_decl = base;
1013 DR_VECT_AUX (dr)->base_misaligned = true;
1014 base_misalignment = 0;
1016 poly_int64 misalignment
1017 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1019 /* If this is a backward running DR then the first access in the larger
1020 vectype actually is N-1 elements before the address in the DR.
1021 Adjust misalign accordingly. */
1022 if (tree_int_cst_sgn (drb->step) < 0)
1023 /* PLUS because STEP is negative. */
1024 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1025 * TREE_INT_CST_LOW (drb->step));
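/* A worked example of this adjustment (hypothetical numbers): for a
   V4SI access with a step of -4 bytes, the first vector access also
   covers the three elements before the address recorded in the DR,
   so (4 - 1) * -4 = -12 is added to the misalignment computed above.  */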
1027 unsigned int const_misalignment;
1028 if (!known_misalignment (misalignment, vector_alignment,
1029 &const_misalignment))
1031 if (dump_enabled_p ())
1033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1034 "Non-constant misalignment for access: ");
1035 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1036 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1038 return;
1041 SET_DR_MISALIGNMENT (dr, const_misalignment);
1043 if (dump_enabled_p ())
1045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1046 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1047 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1048 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1051 return;
1054 /* Function vect_update_misalignment_for_peel.
1055 Sets DR's misalignment
1056 - to 0 if it has the same alignment as DR_PEEL,
1057 - to the misalignment computed using NPEEL if DR's misalignment is known,
1058 - to -1 (unknown) otherwise.
1060 DR - the data reference whose misalignment is to be adjusted.
1061 DR_PEEL - the data reference whose misalignment is being made
1062 zero in the vector loop by the peel.
1063 NPEEL - the number of iterations in the peel loop if the misalignment
1064 of DR_PEEL is known at compile time. */
1066 static void
1067 vect_update_misalignment_for_peel (struct data_reference *dr,
1068 struct data_reference *dr_peel, int npeel)
1070 unsigned int i;
1071 vec<dr_p> same_aligned_drs;
1072 struct data_reference *current_dr;
1073 int dr_size = vect_get_scalar_dr_size (dr);
1074 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1075 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1076 stmt_vec_info peel_stmt_info = vect_dr_stmt (dr_peel);
1078 /* For interleaved data accesses the step in the loop must be multiplied by
1079 the size of the interleaving group. */
1080 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1081 dr_size *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
1082 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1083 dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1085 /* It can be assumed that the data refs with the same alignment as dr_peel
1086 are aligned in the vector loop. */
1087 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (vect_dr_stmt (dr_peel));
1088 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1090 if (current_dr != dr)
1091 continue;
1092 gcc_assert (!known_alignment_for_access_p (dr)
1093 || !known_alignment_for_access_p (dr_peel)
1094 || (DR_MISALIGNMENT (dr) / dr_size
1095 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1096 SET_DR_MISALIGNMENT (dr, 0);
1097 return;
1100 if (known_alignment_for_access_p (dr)
1101 && known_alignment_for_access_p (dr_peel))
1103 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1104 int misal = DR_MISALIGNMENT (dr);
1105 misal += negative ? -npeel * dr_size : npeel * dr_size;
1106 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1107 SET_DR_MISALIGNMENT (dr, misal);
1108 return;
1111 if (dump_enabled_p ())
1112 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1113 "to unknown (-1).\n");
1114 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
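/* A worked example of the known-alignment case above (hypothetical
   numbers): with a target alignment of 16 bytes, an element size of 4,
   a current misalignment of 8 and NPEEL == 2 with a positive step, the
   new misalignment is (8 + 2 * 4) & 15 == 0, i.e. DR becomes aligned
   after the peel.  */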
1118 /* Function verify_data_ref_alignment
1120 Return TRUE if DR can be handled with respect to alignment. */
1122 static bool
1123 verify_data_ref_alignment (data_reference_p dr)
1125 enum dr_alignment_support supportable_dr_alignment
1126 = vect_supportable_dr_alignment (dr, false);
1127 if (!supportable_dr_alignment)
1129 if (dump_enabled_p ())
1131 if (DR_IS_READ (dr))
1132 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1133 "not vectorized: unsupported unaligned load.");
1134 else
1135 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1136 "not vectorized: unsupported unaligned "
1137 "store.");
1139 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1140 DR_REF (dr));
1141 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1143 return false;
1146 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1147 dump_printf_loc (MSG_NOTE, vect_location,
1148 "Vectorizing an unaligned access.\n");
1150 return true;
1153 /* Function vect_verify_datarefs_alignment
1155 Return TRUE if all data references in the loop can be
1156 handled with respect to alignment. */
1158 bool
1159 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1161 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1162 struct data_reference *dr;
1163 unsigned int i;
1165 FOR_EACH_VEC_ELT (datarefs, i, dr)
1167 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1169 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1170 continue;
1172 /* For interleaving, only the alignment of the first access matters. */
1173 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1174 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1175 continue;
1177 /* Strided accesses perform only component accesses, alignment is
1178 irrelevant for them. */
1179 if (STMT_VINFO_STRIDED_P (stmt_info)
1180 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1181 continue;
1183 if (! verify_data_ref_alignment (dr))
1184 return false;
1187 return true;
1190 /* Given a memory reference EXP, return whether its alignment is less
1191 than its size. */
1193 static bool
1194 not_size_aligned (tree exp)
1196 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1197 return true;
1199 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1200 > get_object_alignment (exp));
1203 /* Function vector_alignment_reachable_p
1205 Return true if vector alignment for DR is reachable by peeling
1206 a few loop iterations. Return false otherwise. */
1208 static bool
1209 vector_alignment_reachable_p (struct data_reference *dr)
1211 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1212 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1214 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1216 /* For interleaved accesses we peel only if the number of iterations in
1217 the prolog loop (VF - misalignment) is a multiple of the
1218 number of interleaved accesses. */
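/* E.g. (hypothetical numbers) with 8 elements per vector, a
   misalignment of 2 elements and a group size of 2, the prolog
   would run 8 - 2 = 6 scalar iterations, which is a multiple of
   the group size, so peeling remains possible.  */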
1219 int elem_size, mis_in_elements;
1221 /* FORNOW: handle only known alignment. */
1222 if (!known_alignment_for_access_p (dr))
1223 return false;
1225 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1226 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1227 elem_size = vector_element_size (vector_size, nelements);
1228 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1230 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1231 return false;
1234 /* If misalignment is known at compile time then allow peeling
1235 only if natural alignment is reachable through peeling. */
1236 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1238 HOST_WIDE_INT elmsize =
1239 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1240 if (dump_enabled_p ())
1242 dump_printf_loc (MSG_NOTE, vect_location,
1243 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1244 dump_printf (MSG_NOTE,
1245 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1247 if (DR_MISALIGNMENT (dr) % elmsize)
1249 if (dump_enabled_p ())
1250 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1251 "data size does not divide the misalignment.\n");
1252 return false;
1256 if (!known_alignment_for_access_p (dr))
1258 tree type = TREE_TYPE (DR_REF (dr));
1259 bool is_packed = not_size_aligned (DR_REF (dr));
1260 if (dump_enabled_p ())
1261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1262 "Unknown misalignment, %snaturally aligned\n",
1263 is_packed ? "not " : "");
1264 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1267 return true;
1271 /* Calculate the cost of the memory access represented by DR. */
1273 static void
1274 vect_get_data_access_cost (struct data_reference *dr,
1275 unsigned int *inside_cost,
1276 unsigned int *outside_cost,
1277 stmt_vector_for_cost *body_cost_vec,
1278 stmt_vector_for_cost *prologue_cost_vec)
1280 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1281 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1282 int ncopies;
1284 if (PURE_SLP_STMT (stmt_info))
1285 ncopies = 1;
1286 else
1287 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1289 if (DR_IS_READ (dr))
1290 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1291 prologue_cost_vec, body_cost_vec, false);
1292 else
1293 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1295 if (dump_enabled_p ())
1296 dump_printf_loc (MSG_NOTE, vect_location,
1297 "vect_get_data_access_cost: inside_cost = %d, "
1298 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1302 typedef struct _vect_peel_info
1304 struct data_reference *dr;
1305 int npeel;
1306 unsigned int count;
1307 } *vect_peel_info;
1309 typedef struct _vect_peel_extended_info
1311 struct _vect_peel_info peel_info;
1312 unsigned int inside_cost;
1313 unsigned int outside_cost;
1314 } *vect_peel_extended_info;
1317 /* Peeling hashtable helpers. */
1319 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1321 static inline hashval_t hash (const _vect_peel_info *);
1322 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1325 inline hashval_t
1326 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1328 return (hashval_t) peel_info->npeel;
1331 inline bool
1332 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1334 return (a->npeel == b->npeel);
1338 /* Insert DR into peeling hash table with NPEEL as key. */
1340 static void
1341 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1342 loop_vec_info loop_vinfo, struct data_reference *dr,
1343 int npeel)
1345 struct _vect_peel_info elem, *slot;
1346 _vect_peel_info **new_slot;
1347 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1349 elem.npeel = npeel;
1350 slot = peeling_htab->find (&elem);
1351 if (slot)
1352 slot->count++;
1353 else
1355 slot = XNEW (struct _vect_peel_info);
1356 slot->npeel = npeel;
1357 slot->dr = dr;
1358 slot->count = 1;
1359 new_slot = peeling_htab->find_slot (slot, INSERT);
1360 *new_slot = slot;
1363 if (!supportable_dr_alignment
1364 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1365 slot->count += VECT_MAX_COST;
1369 /* Traverse the peeling hash table to find the peeling option that aligns
1370 the maximum number of data accesses. */
1372 static int
1373 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1374 _vect_peel_extended_info *max)
1376 vect_peel_info elem = *slot;
1378 if (elem->count > max->peel_info.count
1379 || (elem->count == max->peel_info.count
1380 && max->peel_info.npeel > elem->npeel))
1382 max->peel_info.npeel = elem->npeel;
1383 max->peel_info.count = elem->count;
1384 max->peel_info.dr = elem->dr;
1387 return 1;
1390 /* Get the costs of peeling NPEEL iterations checking data access costs
1391 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1392 misalignment will be zero after peeling. */
1394 static void
1395 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1396 struct data_reference *dr0,
1397 unsigned int *inside_cost,
1398 unsigned int *outside_cost,
1399 stmt_vector_for_cost *body_cost_vec,
1400 stmt_vector_for_cost *prologue_cost_vec,
1401 unsigned int npeel,
1402 bool unknown_misalignment)
1404 unsigned i;
1405 data_reference *dr;
1407 FOR_EACH_VEC_ELT (datarefs, i, dr)
1409 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1410 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1411 continue;
1413 /* For interleaving, only the alignment of the first access
1414 matters. */
1415 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1416 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1417 continue;
1419 /* Strided accesses perform only component accesses, alignment is
1420 irrelevant for them. */
1421 if (STMT_VINFO_STRIDED_P (stmt_info)
1422 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1423 continue;
1425 int save_misalignment;
1426 save_misalignment = DR_MISALIGNMENT (dr);
1427 if (npeel == 0)
1429 else if (unknown_misalignment && dr == dr0)
1430 SET_DR_MISALIGNMENT (dr, 0);
1431 else
1432 vect_update_misalignment_for_peel (dr, dr0, npeel);
1433 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1434 body_cost_vec, prologue_cost_vec);
1435 SET_DR_MISALIGNMENT (dr, save_misalignment);
1439 /* Traverse the peeling hash table and calculate the cost for each peeling
1440 option. Find the one with the lowest cost. */
1442 static int
1443 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1444 _vect_peel_extended_info *min)
1446 vect_peel_info elem = *slot;
1447 int dummy;
1448 unsigned int inside_cost = 0, outside_cost = 0;
1449 stmt_vec_info stmt_info = vect_dr_stmt (elem->dr);
1450 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1451 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1452 epilogue_cost_vec;
1454 prologue_cost_vec.create (2);
1455 body_cost_vec.create (2);
1456 epilogue_cost_vec.create (2);
1458 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1459 elem->dr, &inside_cost, &outside_cost,
1460 &body_cost_vec, &prologue_cost_vec,
1461 elem->npeel, false);
1463 body_cost_vec.release ();
1465 outside_cost += vect_get_known_peeling_cost
1466 (loop_vinfo, elem->npeel, &dummy,
1467 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1468 &prologue_cost_vec, &epilogue_cost_vec);
1470 /* Prologue and epilogue costs are added to the target model later.
1471 These costs depend only on the scalar iteration cost, the
1472 number of peeling iterations finally chosen, and the number of
1473 misaligned statements. So discard the information found here. */
1474 prologue_cost_vec.release ();
1475 epilogue_cost_vec.release ();
1477 if (inside_cost < min->inside_cost
1478 || (inside_cost == min->inside_cost
1479 && outside_cost < min->outside_cost))
1481 min->inside_cost = inside_cost;
1482 min->outside_cost = outside_cost;
1483 min->peel_info.dr = elem->dr;
1484 min->peel_info.npeel = elem->npeel;
1485 min->peel_info.count = elem->count;
1488 return 1;
1492 /* Choose best peeling option by traversing peeling hash table and either
1493 choosing an option with the lowest cost (if cost model is enabled) or the
1494 option that aligns as many accesses as possible. */
1496 static struct _vect_peel_extended_info
1497 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1498 loop_vec_info loop_vinfo)
1500 struct _vect_peel_extended_info res;
1502 res.peel_info.dr = NULL;
1504 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1506 res.inside_cost = INT_MAX;
1507 res.outside_cost = INT_MAX;
1508 peeling_htab->traverse <_vect_peel_extended_info *,
1509 vect_peeling_hash_get_lowest_cost> (&res);
1511 else
1513 res.peel_info.count = 0;
1514 peeling_htab->traverse <_vect_peel_extended_info *,
1515 vect_peeling_hash_get_most_frequent> (&res);
1516 res.inside_cost = 0;
1517 res.outside_cost = 0;
1520 return res;
1523 /* Return true if the new peeling NPEEL is supported. */
1525 static bool
1526 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1527 unsigned npeel)
1529 unsigned i;
1530 struct data_reference *dr = NULL;
1531 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1532 enum dr_alignment_support supportable_dr_alignment;
1534 /* Ensure that all data refs can be vectorized after the peel. */
1535 FOR_EACH_VEC_ELT (datarefs, i, dr)
1537 int save_misalignment;
1539 if (dr == dr0)
1540 continue;
1542 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1543 /* For interleaving, only the alignment of the first access
1544 matters. */
1545 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1546 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1547 continue;
1549 /* Strided accesses perform only component accesses, alignment is
1550 irrelevant for them. */
1551 if (STMT_VINFO_STRIDED_P (stmt_info)
1552 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1553 continue;
1555 save_misalignment = DR_MISALIGNMENT (dr);
1556 vect_update_misalignment_for_peel (dr, dr0, npeel);
1557 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1558 SET_DR_MISALIGNMENT (dr, save_misalignment);
1560 if (!supportable_dr_alignment)
1561 return false;
1564 return true;
1567 /* Function vect_enhance_data_refs_alignment
1569 This pass will use loop versioning and loop peeling in order to enhance
1570 the alignment of data references in the loop.
1572 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1573 original loop is to be vectorized. Any other loops that are created by
1574 the transformations performed in this pass are not supposed to be
1575 vectorized. This restriction will be relaxed.
1577 This pass will require a cost model to guide it whether to apply peeling
1578 or versioning or a combination of the two. For example, the scheme that
1579 Intel uses when given a loop with several memory accesses is as follows:
1580 choose one memory access ('p') whose alignment you want to force by doing
1581 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1582 other accesses are not necessarily aligned, or (2) use loop versioning to
1583 generate one loop in which all accesses are aligned, and another loop in
1584 which only 'p' is necessarily aligned.
1586 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1587 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1588 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1590 Devising a cost model is the most critical aspect of this work. It will
1591 guide us on which access to peel for, whether to use loop versioning, how
1592 many versions to create, etc. The cost model will probably consist of
1593 generic considerations as well as target specific considerations (on
1594 powerpc for example, misaligned stores are more painful than misaligned
1595 loads).
1597 Here are the general steps involved in alignment enhancements:
1599 -- original loop, before alignment analysis:
1600 for (i=0; i<N; i++){
1601 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1602 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1605 -- After vect_compute_data_refs_alignment:
1606 for (i=0; i<N; i++){
1607 x = q[i]; # DR_MISALIGNMENT(q) = 3
1608 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1611 -- Possibility 1: we do loop versioning:
1612 if (p is aligned) {
1613 for (i=0; i<N; i++){ # loop 1A
1614 x = q[i]; # DR_MISALIGNMENT(q) = 3
1615 p[i] = y; # DR_MISALIGNMENT(p) = 0
1618 else {
1619 for (i=0; i<N; i++){ # loop 1B
1620 x = q[i]; # DR_MISALIGNMENT(q) = 3
1621 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1625 -- Possibility 2: we do loop peeling:
1626 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1627 x = q[i];
1628 p[i] = y;
1630 for (i = 3; i < N; i++){ # loop 2A
1631 x = q[i]; # DR_MISALIGNMENT(q) = 0
1632 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1635 -- Possibility 3: combination of loop peeling and versioning:
1636 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1637 x = q[i];
1638 p[i] = y;
1640 if (p is aligned) {
1641 for (i = 3; i<N; i++){ # loop 3A
1642 x = q[i]; # DR_MISALIGNMENT(q) = 0
1643 p[i] = y; # DR_MISALIGNMENT(p) = 0
1646 else {
1647 for (i = 3; i<N; i++){ # loop 3B
1648 x = q[i]; # DR_MISALIGNMENT(q) = 0
1649 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1653 These loops are later passed to loop_transform to be vectorized. The
1654 vectorizer will use the alignment information to guide the transformation
1655 (whether to generate regular loads/stores, or with special handling for
1656 misalignment). */
1658 bool
1659 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1661 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1662 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1663 enum dr_alignment_support supportable_dr_alignment;
1664 struct data_reference *dr0 = NULL, *first_store = NULL;
1665 struct data_reference *dr;
1666 unsigned int i, j;
1667 bool do_peeling = false;
1668 bool do_versioning = false;
1669 bool stat;
1670 unsigned int npeel = 0;
1671 bool one_misalignment_known = false;
1672 bool one_misalignment_unknown = false;
1673 bool one_dr_unsupportable = false;
1674 struct data_reference *unsupportable_dr = NULL;
1675 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1676 unsigned possible_npeel_number = 1;
1677 tree vectype;
1678 unsigned int mis, same_align_drs_max = 0;
1679 hash_table<peel_info_hasher> peeling_htab (1);
1681 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1683 /* Reset data so we can safely be called multiple times. */
1684 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1685 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1687 /* While cost model enhancements are expected in the future, the high level
1688 view of the code at this time is as follows:
1690 A) If there is a misaligned access then see if peeling to align
1691 this access can make all data references satisfy
1692 vect_supportable_dr_alignment. If so, update data structures
1693 as needed and return true.
1695 B) If peeling wasn't possible and there is a data reference with an
1696 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1697 then see if loop versioning checks can be used to make all data
1698 references satisfy vect_supportable_dr_alignment. If so, update
1699 data structures as needed and return true.
1701 C) If neither peeling nor versioning were successful then return false if
1702 any data reference does not satisfy vect_supportable_dr_alignment.
1704 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1706 Note, Possibility 3 above (which is peeling and versioning together) is not
1707 being done at this time. */
1709 /* (1) Peeling to force alignment. */
1711 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1712 Considerations:
1713 + How many accesses will become aligned due to the peeling
1714 - How many accesses will become unaligned due to the peeling,
1715 and the cost of misaligned accesses.
1716 - The cost of peeling (the extra runtime checks, the increase
1717 in code size). */
1719 FOR_EACH_VEC_ELT (datarefs, i, dr)
1721 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1723 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1724 continue;
1726 /* For interleaving, only the alignment of the first access
1727 matters. */
1728 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1729 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1730 continue;
1732 /* For scatter-gather or invariant accesses there is nothing
1733 to enhance. */
1734 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1735 || integer_zerop (DR_STEP (dr)))
1736 continue;
1738 /* Strided accesses perform only component accesses, alignment is
1739 irrelevant for them. */
1740 if (STMT_VINFO_STRIDED_P (stmt_info)
1741 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1742 continue;
1744 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1745 do_peeling = vector_alignment_reachable_p (dr);
1746 if (do_peeling)
1748 if (known_alignment_for_access_p (dr))
1750 unsigned int npeel_tmp = 0;
1751 bool negative = tree_int_cst_compare (DR_STEP (dr),
1752 size_zero_node) < 0;
1754 vectype = STMT_VINFO_VECTYPE (stmt_info);
1755 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1756 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1757 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1758 if (DR_MISALIGNMENT (dr) != 0)
1759 npeel_tmp = (mis & (target_align - 1)) / dr_size;
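/* Illustrative sketch (not part of the original source): assume a
   hypothetical 4-byte access with DR_TARGET_ALIGNMENT of 16 bytes, a
   positive step and DR_MISALIGNMENT of 4.  Then mis wraps to -4,
   (-4 & 15) == 12, and npeel_tmp == 12 / 4 == 3: peeling three scalar
   iterations advances the address by 12 bytes and reaches the next
   16-byte boundary.  */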
1761 /* For multiple types, it is possible that the bigger type access
1762 will have more than one peeling option. E.g., a loop with two
1763 types: one of size (vector size / 4), and the other one of
1764 size (vector size / 8). Vectorization factor will be 8. If both
1765 accesses are misaligned by 3, the first one needs one scalar
1766 iteration to be aligned, and the second one needs 5. But the
1767 first one will be aligned also by peeling 5 scalar
1768 iterations, and in that case both accesses will be aligned.
1769 Hence, except for the immediate peeling amount, we also want
1770 to try to add full vector size, while we don't exceed
1771 vectorization factor.
1772 We do this automatically for cost model, since we calculate
1773 cost for every peeling option. */
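/* Worked illustration of the comment above (hypothetical numbers, not from
   the original source): with a 16-byte vector, a vectorization factor of 8
   and a four-element vector type, the 4-byte access misaligned by 12 bytes
   gets npeel_tmp == 1 and possible_npeel_number == 2, so candidates
   1 and 1 + 16/4 == 5 are recorded; the 2-byte access misaligned by 6 bytes
   gets npeel_tmp == 5 with possible_npeel_number == 1 and records only 5.
   The cost model can then discover that peeling 5 iterations aligns both
   accesses.  */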
1774 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1776 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1777 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1778 possible_npeel_number
1779 = vect_get_num_vectors (nscalars, vectype);
1781 /* NPEEL_TMP is 0 when there is no misalignment, but also
1782 allow peeling NELEMENTS. */
1783 if (DR_MISALIGNMENT (dr) == 0)
1784 possible_npeel_number++;
1787 /* Save info about DR in the hash table. Also include peeling
1788 amounts according to the explanation above. */
1789 for (j = 0; j < possible_npeel_number; j++)
1791 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1792 dr, npeel_tmp);
1793 npeel_tmp += target_align / dr_size;
1796 one_misalignment_known = true;
1798 else
1800 /* If we don't know any misalignment values, we prefer
1801 peeling for the data-ref that has the maximum number of data-refs
1802 with the same alignment, unless the target prefers to align
1803 stores over loads. */
1804 unsigned same_align_drs
1805 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1806 if (!dr0
1807 || same_align_drs_max < same_align_drs)
1809 same_align_drs_max = same_align_drs;
1810 dr0 = dr;
1812 /* For data-refs with the same number of related
1813 accesses prefer the one where the misalign
1814 computation will be invariant in the outermost loop. */
1815 else if (same_align_drs_max == same_align_drs)
1817 struct loop *ivloop0, *ivloop;
1818 ivloop0 = outermost_invariant_loop_for_expr
1819 (loop, DR_BASE_ADDRESS (dr0));
1820 ivloop = outermost_invariant_loop_for_expr
1821 (loop, DR_BASE_ADDRESS (dr));
1822 if ((ivloop && !ivloop0)
1823 || (ivloop && ivloop0
1824 && flow_loop_nested_p (ivloop, ivloop0)))
1825 dr0 = dr;
1828 one_misalignment_unknown = true;
1830 /* Check for data refs with unsupportable alignment that
1831 can be peeled. */
1832 if (!supportable_dr_alignment)
1834 one_dr_unsupportable = true;
1835 unsupportable_dr = dr;
1838 if (!first_store && DR_IS_WRITE (dr))
1839 first_store = dr;
1842 else
1844 if (!aligned_access_p (dr))
1846 if (dump_enabled_p ())
1847 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1848 "vector alignment may not be reachable\n");
1849 break;
1854 /* Check if we can possibly peel the loop. */
1855 if (!vect_can_advance_ivs_p (loop_vinfo)
1856 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1857 || loop->inner)
1858 do_peeling = false;
1860 struct _vect_peel_extended_info peel_for_known_alignment;
1861 struct _vect_peel_extended_info peel_for_unknown_alignment;
1862 struct _vect_peel_extended_info best_peel;
1864 peel_for_unknown_alignment.inside_cost = INT_MAX;
1865 peel_for_unknown_alignment.outside_cost = INT_MAX;
1866 peel_for_unknown_alignment.peel_info.count = 0;
1868 if (do_peeling
1869 && one_misalignment_unknown)
1871 /* Check whether the target prefers stores over loads, i.e., whether
1872 misaligned stores are more expensive than misaligned loads (taking
1873 drs with same alignment into account). */
1874 unsigned int load_inside_cost = 0;
1875 unsigned int load_outside_cost = 0;
1876 unsigned int store_inside_cost = 0;
1877 unsigned int store_outside_cost = 0;
1878 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1880 stmt_vector_for_cost dummy;
1881 dummy.create (2);
1882 vect_get_peeling_costs_all_drs (datarefs, dr0,
1883 &load_inside_cost,
1884 &load_outside_cost,
1885 &dummy, &dummy, estimated_npeels, true);
1886 dummy.release ();
1888 if (first_store)
1890 dummy.create (2);
1891 vect_get_peeling_costs_all_drs (datarefs, first_store,
1892 &store_inside_cost,
1893 &store_outside_cost,
1894 &dummy, &dummy,
1895 estimated_npeels, true);
1896 dummy.release ();
1898 else
1900 store_inside_cost = INT_MAX;
1901 store_outside_cost = INT_MAX;
1904 if (load_inside_cost > store_inside_cost
1905 || (load_inside_cost == store_inside_cost
1906 && load_outside_cost > store_outside_cost))
1908 dr0 = first_store;
1909 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1910 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1912 else
1914 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1915 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1918 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1919 prologue_cost_vec.create (2);
1920 epilogue_cost_vec.create (2);
1922 int dummy2;
1923 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1924 (loop_vinfo, estimated_npeels, &dummy2,
1925 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1926 &prologue_cost_vec, &epilogue_cost_vec);
1928 prologue_cost_vec.release ();
1929 epilogue_cost_vec.release ();
1931 peel_for_unknown_alignment.peel_info.count = 1
1932 + STMT_VINFO_SAME_ALIGN_REFS (vect_dr_stmt (dr0)).length ();
1935 peel_for_unknown_alignment.peel_info.npeel = 0;
1936 peel_for_unknown_alignment.peel_info.dr = dr0;
1938 best_peel = peel_for_unknown_alignment;
1940 peel_for_known_alignment.inside_cost = INT_MAX;
1941 peel_for_known_alignment.outside_cost = INT_MAX;
1942 peel_for_known_alignment.peel_info.count = 0;
1943 peel_for_known_alignment.peel_info.dr = NULL;
1945 if (do_peeling && one_misalignment_known)
1947 /* Peeling is possible, but no data access strictly requires alignment in
1948 order to be supported. So we try to choose the best possible peeling from
1949 the hash table. */
1950 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1951 (&peeling_htab, loop_vinfo);
1954 /* Compare costs of peeling for known and unknown alignment. */
1955 if (peel_for_known_alignment.peel_info.dr != NULL
1956 && peel_for_unknown_alignment.inside_cost
1957 >= peel_for_known_alignment.inside_cost)
1959 best_peel = peel_for_known_alignment;
1961 /* If the best peeling for known alignment has NPEEL == 0, perform no
1962 peeling at all except if there is an unsupportable dr that we can
1963 align. */
1964 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1965 do_peeling = false;
1968 /* If there is an unsupportable data ref, prefer this over all choices so far
1969 since we'd have to discard a chosen peeling except when it accidentally
1970 aligned the unsupportable data ref. */
1971 if (one_dr_unsupportable)
1972 dr0 = unsupportable_dr;
1973 else if (do_peeling)
1975 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1976 TODO: Use nopeel_outside_cost or get rid of it? */
1977 unsigned nopeel_inside_cost = 0;
1978 unsigned nopeel_outside_cost = 0;
1980 stmt_vector_for_cost dummy;
1981 dummy.create (2);
1982 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1983 &nopeel_outside_cost, &dummy, &dummy,
1984 0, false);
1985 dummy.release ();
1987 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1988 costs will be recorded. */
1989 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1990 prologue_cost_vec.create (2);
1991 epilogue_cost_vec.create (2);
1993 int dummy2;
1994 nopeel_outside_cost += vect_get_known_peeling_cost
1995 (loop_vinfo, 0, &dummy2,
1996 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1997 &prologue_cost_vec, &epilogue_cost_vec);
1999 prologue_cost_vec.release ();
2000 epilogue_cost_vec.release ();
2002 npeel = best_peel.peel_info.npeel;
2003 dr0 = best_peel.peel_info.dr;
2005 /* If not peeling is no more expensive than the best peeling we
2006 have found so far, don't perform any peeling. */
2007 if (nopeel_inside_cost <= best_peel.inside_cost)
2008 do_peeling = false;
2011 if (do_peeling)
2013 stmt_vec_info stmt_info = vect_dr_stmt (dr0);
2014 vectype = STMT_VINFO_VECTYPE (stmt_info);
2016 if (known_alignment_for_access_p (dr0))
2018 bool negative = tree_int_cst_compare (DR_STEP (dr0),
2019 size_zero_node) < 0;
2020 if (!npeel)
2022 /* Since it's known at compile time, compute the number of
2023 iterations in the peeled loop (the peeling factor) for use in
2024 updating DR_MISALIGNMENT values. The peeling factor is the
2025 vectorization factor minus the misalignment as an element
2026 count. */
2027 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2028 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2029 npeel = ((mis & (target_align - 1))
2030 / vect_get_scalar_dr_size (dr0));
2033 /* For interleaved data access every iteration accesses all the
2034 members of the group, therefore we divide the number of iterations
2035 by the group size. */
2036 stmt_info = vect_dr_stmt (dr0);
2037 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2038 npeel /= DR_GROUP_SIZE (stmt_info);
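/* Illustration (hypothetical numbers, not from the original source): if DR0
   has DR_MISALIGNMENT 8, a 4-byte element, a positive step and
   DR_TARGET_ALIGNMENT 16, then mis wraps to -8, (-8 & 15) == 8 and
   npeel == 8 / 4 == 2.  If DR0 heads an interleaving group of size 2, each
   scalar iteration covers both members, so npeel is halved to 1.  */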
2040 if (dump_enabled_p ())
2041 dump_printf_loc (MSG_NOTE, vect_location,
2042 "Try peeling by %d\n", npeel);
2045 /* Ensure that all datarefs can be vectorized after the peel. */
2046 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2047 do_peeling = false;
2049 /* Check if all datarefs are supportable and log. */
2050 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2052 stat = vect_verify_datarefs_alignment (loop_vinfo);
2053 if (!stat)
2054 do_peeling = false;
2055 else
2056 return stat;
2059 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2060 if (do_peeling)
2062 unsigned max_allowed_peel
2063 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2064 if (max_allowed_peel != (unsigned)-1)
2066 unsigned max_peel = npeel;
2067 if (max_peel == 0)
2069 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2070 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2072 if (max_peel > max_allowed_peel)
2074 do_peeling = false;
2075 if (dump_enabled_p ())
2076 dump_printf_loc (MSG_NOTE, vect_location,
2077 "Disable peeling, max peels reached: %d\n", max_peel);
2082 /* Cost model #2 - if peeling may result in a remaining loop not
2083 iterating enough to be vectorized then do not peel. Since this
2084 is a cost heuristic rather than a correctness decision, use the
2085 most likely runtime value for variable vectorization factors. */
2086 if (do_peeling
2087 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2089 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2090 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2091 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2092 < assumed_vf + max_peel)
2093 do_peeling = false;
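/* For example (illustrative only): with an assumed vectorization factor of 4
   and npeel == 0, max_peel == 3, so a known trip count below 4 + 3 == 7
   iterations disables peeling here, while 7 or more keeps it.  */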
2096 if (do_peeling)
2098 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2099 If the misalignment of DR_i is identical to that of dr0 then set
2100 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2101 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2102 by the peeling factor times the element size of DR_i (MOD the
2103 vectorization factor times the size). Otherwise, the
2104 misalignment of DR_i must be set to unknown. */
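/* Hypothetical illustration of the rule above (not from the original
   source), assuming a vectorization factor of 4 and 4-byte elements: with
   npeel == 2, a DR_i with the same misalignment as DR0 becomes 0, a DR_i
   with known misalignment 4 becomes (4 + 2 * 4) mod (4 * 4) == 12, and a
   DR_i whose misalignment is unknown stays unknown.  */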
2105 FOR_EACH_VEC_ELT (datarefs, i, dr)
2106 if (dr != dr0)
2108 /* Strided accesses perform only component accesses, alignment
2109 is irrelevant for them. */
2110 stmt_info = vect_dr_stmt (dr);
2111 if (STMT_VINFO_STRIDED_P (stmt_info)
2112 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2113 continue;
2115 vect_update_misalignment_for_peel (dr, dr0, npeel);
2118 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2119 if (npeel)
2120 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2121 else
2122 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2123 = DR_MISALIGNMENT (dr0);
2124 SET_DR_MISALIGNMENT (dr0, 0);
2125 if (dump_enabled_p ())
2127 dump_printf_loc (MSG_NOTE, vect_location,
2128 "Alignment of access forced using peeling.\n");
2129 dump_printf_loc (MSG_NOTE, vect_location,
2130 "Peeling for alignment will be applied.\n");
2133 /* The inside-loop cost will be accounted for in vectorizable_load
2134 and vectorizable_store correctly with adjusted alignments.
2135 Drop the body_cst_vec on the floor here. */
2136 stat = vect_verify_datarefs_alignment (loop_vinfo);
2137 gcc_assert (stat);
2138 return stat;
2142 /* (2) Versioning to force alignment. */
2144 /* Try versioning if:
2145 1) optimize loop for speed
2146 2) there is at least one unsupported misaligned data ref with an unknown
2147 misalignment, and
2148 3) all misaligned data refs with a known misalignment are supported, and
2149 4) the number of runtime alignment checks is within reason. */
2151 do_versioning =
2152 optimize_loop_nest_for_speed_p (loop)
2153 && (!loop->inner); /* FORNOW */
2155 if (do_versioning)
2157 FOR_EACH_VEC_ELT (datarefs, i, dr)
2159 stmt_vec_info stmt_info = vect_dr_stmt (dr);
2161 /* For interleaving, only the alignment of the first access
2162 matters. */
2163 if (aligned_access_p (dr)
2164 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2165 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2166 continue;
2168 if (STMT_VINFO_STRIDED_P (stmt_info))
2170 /* Strided loads perform only component accesses, alignment is
2171 irrelevant for them. */
2172 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2173 continue;
2174 do_versioning = false;
2175 break;
2178 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2180 if (!supportable_dr_alignment)
2182 int mask;
2183 tree vectype;
2185 if (known_alignment_for_access_p (dr)
2186 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2187 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2189 do_versioning = false;
2190 break;
2193 stmt_info = vect_dr_stmt (dr);
2194 vectype = STMT_VINFO_VECTYPE (stmt_info);
2195 gcc_assert (vectype);
2197 /* At present we don't support versioning for alignment
2198 with variable VF, since there's no guarantee that the
2199 VF is a power of two. We could relax this if we added
2200 a way of enforcing a power-of-two size. */
2201 unsigned HOST_WIDE_INT size;
2202 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2204 do_versioning = false;
2205 break;
2208 /* The rightmost bits of an aligned address must be zeros.
2209 Construct the mask needed for this test. For example,
2210 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2211 mask must be 15 = 0xf. */
2212 mask = size - 1;
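/* Illustration (a sketch, not the code emitted here): for V4SI,
   mask == 0xf, and the runtime guard built later for the versioned loop is
   roughly

     if (((addr_1 | addr_2 | ...) & 0xf) == 0)
       ... run the vectorized loop ...
     else
       ... run the original scalar loop ...

   over the addresses of the statements recorded in
   LOOP_VINFO_MAY_MISALIGN_STMTS below.  */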
2214 /* FORNOW: use the same mask to test all potentially unaligned
2215 references in the loop. The vectorizer currently supports
2216 a single vector size, see the reference to
2217 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2218 vectorization factor is computed. */
2219 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2220 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2221 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2222 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2226 /* Versioning requires at least one misaligned data reference. */
2227 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2228 do_versioning = false;
2229 else if (!do_versioning)
2230 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2233 if (do_versioning)
2235 vec<stmt_vec_info> may_misalign_stmts
2236 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2237 stmt_vec_info stmt_info;
2239 /* It can now be assumed that the data references in the statements
2240 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2241 of the loop being vectorized. */
2242 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2244 dr = STMT_VINFO_DATA_REF (stmt_info);
2245 SET_DR_MISALIGNMENT (dr, 0);
2246 if (dump_enabled_p ())
2247 dump_printf_loc (MSG_NOTE, vect_location,
2248 "Alignment of access forced using versioning.\n");
2251 if (dump_enabled_p ())
2252 dump_printf_loc (MSG_NOTE, vect_location,
2253 "Versioning for alignment will be applied.\n");
2255 /* Peeling and versioning can't be done together at this time. */
2256 gcc_assert (! (do_peeling && do_versioning));
2258 stat = vect_verify_datarefs_alignment (loop_vinfo);
2259 gcc_assert (stat);
2260 return stat;
2263 /* This point is reached if neither peeling nor versioning is being done. */
2264 gcc_assert (! (do_peeling || do_versioning));
2266 stat = vect_verify_datarefs_alignment (loop_vinfo);
2267 return stat;
2271 /* Function vect_find_same_alignment_drs.
2273 Update group and alignment relations according to the chosen
2274 vectorization factor. */
2276 static void
2277 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2279 struct data_reference *dra = DDR_A (ddr);
2280 struct data_reference *drb = DDR_B (ddr);
2281 stmt_vec_info stmtinfo_a = vect_dr_stmt (dra);
2282 stmt_vec_info stmtinfo_b = vect_dr_stmt (drb);
2284 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2285 return;
2287 if (dra == drb)
2288 return;
2290 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2291 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2292 return;
2294 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2295 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2296 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2297 return;
2299 /* Two references with distance zero have the same alignment. */
2300 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2301 - wi::to_poly_offset (DR_INIT (drb)));
2302 if (maybe_ne (diff, 0))
2304 /* Get the wider of the two alignments. */
2305 unsigned int align_a = (vect_calculate_target_alignment (dra)
2306 / BITS_PER_UNIT);
2307 unsigned int align_b = (vect_calculate_target_alignment (drb)
2308 / BITS_PER_UNIT);
2309 unsigned int max_align = MAX (align_a, align_b);
2311 /* Require the gap to be a multiple of the larger vector alignment. */
2312 if (!multiple_p (diff, max_align))
2313 return;
2316 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2317 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2318 if (dump_enabled_p ())
2320 dump_printf_loc (MSG_NOTE, vect_location,
2321 "accesses have the same alignment: ");
2322 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2323 dump_printf (MSG_NOTE, " and ");
2324 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2325 dump_printf (MSG_NOTE, "\n");
2330 /* Function vect_analyze_data_refs_alignment
2332 Analyze the alignment of the data-references in the loop.
2333 Return FALSE if a data reference is found that cannot be vectorized. */
2335 bool
2336 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2338 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2340 /* Mark groups of data references with same alignment using
2341 data dependence information. */
2342 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2343 struct data_dependence_relation *ddr;
2344 unsigned int i;
2346 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2347 vect_find_same_alignment_drs (ddr);
2349 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2350 struct data_reference *dr;
2352 vect_record_base_alignments (vinfo);
2353 FOR_EACH_VEC_ELT (datarefs, i, dr)
2355 stmt_vec_info stmt_info = vect_dr_stmt (dr);
2356 if (STMT_VINFO_VECTORIZABLE (stmt_info))
2357 vect_compute_data_ref_alignment (dr);
2360 return true;
2364 /* Analyze alignment of DRs of stmts in NODE. */
2366 static bool
2367 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2369 /* We vectorize from the first scalar stmt in the node unless
2370 the node is permuted in which case we start from the first
2371 element in the group. */
2372 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2373 data_reference_p first_dr = STMT_VINFO_DATA_REF (first_stmt_info);
2374 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2375 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2377 data_reference_p dr = STMT_VINFO_DATA_REF (first_stmt_info);
2378 vect_compute_data_ref_alignment (dr);
2379 /* For creating the data-ref pointer we need alignment of the
2380 first element anyway. */
2381 if (dr != first_dr)
2382 vect_compute_data_ref_alignment (first_dr);
2383 if (! verify_data_ref_alignment (dr))
2385 if (dump_enabled_p ())
2386 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2387 "not vectorized: bad data alignment in basic "
2388 "block.\n");
2389 return false;
2392 return true;
2395 /* Function vect_slp_analyze_and_verify_instance_alignment
2397 Analyze the alignment of the data-references in the SLP instance.
2398 Return FALSE if a data reference is found that cannot be vectorized. */
2400 bool
2401 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2403 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2405 slp_tree node;
2406 unsigned i;
2407 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2408 if (! vect_slp_analyze_and_verify_node_alignment (node))
2409 return false;
2411 node = SLP_INSTANCE_TREE (instance);
2412 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2413 && ! vect_slp_analyze_and_verify_node_alignment
2414 (SLP_INSTANCE_TREE (instance)))
2415 return false;
2417 return true;
2421 /* Analyze groups of accesses: check that DR belongs to a group of
2422 accesses of legal size, step, etc. Detect gaps, single element
2423 interleaving, and other special cases. Set grouped access info.
2424 Collect groups of strided stores for further use in SLP analysis.
2425 Worker for vect_analyze_group_access. */
2427 static bool
2428 vect_analyze_group_access_1 (struct data_reference *dr)
2430 tree step = DR_STEP (dr);
2431 tree scalar_type = TREE_TYPE (DR_REF (dr));
2432 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2433 stmt_vec_info stmt_info = vect_dr_stmt (dr);
2434 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2435 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2436 HOST_WIDE_INT dr_step = -1;
2437 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2438 bool slp_impossible = false;
2440 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2441 size of the interleaving group (including gaps). */
2442 if (tree_fits_shwi_p (step))
2444 dr_step = tree_to_shwi (step);
2445 /* Check that STEP is a multiple of type size. Otherwise there is
2446 a non-element-sized gap at the end of the group which we
2447 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2448 ??? As we can handle non-constant step fine here we should
2449 simply remove uses of DR_GROUP_GAP between the last and first
2450 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2451 simply not include that gap. */
2452 if ((dr_step % type_size) != 0)
2454 if (dump_enabled_p ())
2456 dump_printf_loc (MSG_NOTE, vect_location,
2457 "Step ");
2458 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2459 dump_printf (MSG_NOTE,
2460 " is not a multiple of the element size for ");
2461 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2462 dump_printf (MSG_NOTE, "\n");
2464 return false;
2466 groupsize = absu_hwi (dr_step) / type_size;
2468 else
2469 groupsize = 0;
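/* Example (illustrative only): for a load "x = a[4*i]" of 4-byte ints,
   DR_STEP is 16 and type_size is 4, so groupsize is 4.  If no other data-ref
   shares the group, the access is treated below as single element
   interleaving with DR_GROUP_GAP == 3.  */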
2471 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2472 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2474 /* Check whether this DR is part of an interleaving group of which it is
2475 the single element accessed in the loop. */
2477 /* Gaps are supported only for loads. STEP must be a multiple of the type
2478 size. */
2479 if (DR_IS_READ (dr)
2480 && (dr_step % type_size) == 0
2481 && groupsize > 0)
2483 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2484 DR_GROUP_SIZE (stmt_info) = groupsize;
2485 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2486 if (dump_enabled_p ())
2488 dump_printf_loc (MSG_NOTE, vect_location,
2489 "Detected single element interleaving ");
2490 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2491 dump_printf (MSG_NOTE, " step ");
2492 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2493 dump_printf (MSG_NOTE, "\n");
2496 return true;
2499 if (dump_enabled_p ())
2501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2502 "not consecutive access ");
2503 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2504 stmt_info->stmt, 0);
2507 if (bb_vinfo)
2509 /* Mark the statement as unvectorizable. */
2510 STMT_VINFO_VECTORIZABLE (vect_dr_stmt (dr)) = false;
2511 return true;
2514 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2515 STMT_VINFO_STRIDED_P (stmt_info) = true;
2516 return true;
2519 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2521 /* First stmt in the interleaving chain. Check the chain. */
2522 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2523 struct data_reference *data_ref = dr;
2524 unsigned int count = 1;
2525 tree prev_init = DR_INIT (data_ref);
2526 stmt_vec_info prev = stmt_info;
2527 HOST_WIDE_INT diff, gaps = 0;
2529 /* By construction, all group members have INTEGER_CST DR_INITs. */
2530 while (next)
2532 /* Skip identical data-refs. In case two or more stmts share a
2533 data-ref (supported only for loads), we vectorize only the first
2534 stmt, and the rest get their vectorized loads from the first
2535 one. */
2536 if (!tree_int_cst_compare (DR_INIT (data_ref),
2537 DR_INIT (STMT_VINFO_DATA_REF (next))))
2539 if (DR_IS_WRITE (data_ref))
2541 if (dump_enabled_p ())
2542 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2543 "Two store stmts share the same dr.\n");
2544 return false;
2547 if (dump_enabled_p ())
2548 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2549 "Two or more load stmts share the same dr.\n");
2551 /* For load use the same data-ref load. */
2552 DR_GROUP_SAME_DR_STMT (next) = prev;
2554 prev = next;
2555 next = DR_GROUP_NEXT_ELEMENT (next);
2556 continue;
2559 prev = next;
2560 data_ref = STMT_VINFO_DATA_REF (next);
2562 /* All group members have the same STEP by construction. */
2563 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2565 /* Check that the distance between two accesses is equal to the type
2566 size. Otherwise, we have gaps. */
2567 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2568 - TREE_INT_CST_LOW (prev_init)) / type_size;
2569 if (diff != 1)
2571 /* FORNOW: SLP of accesses with gaps is not supported. */
2572 slp_impossible = true;
2573 if (DR_IS_WRITE (data_ref))
2575 if (dump_enabled_p ())
2576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2577 "interleaved store with gaps\n");
2578 return false;
2581 gaps += diff - 1;
2584 last_accessed_element += diff;
2586 /* Store the gap from the previous member of the group. If there is no
2587 gap in the access, DR_GROUP_GAP is always 1. */
2588 DR_GROUP_GAP (next) = diff;
2590 prev_init = DR_INIT (data_ref);
2591 next = DR_GROUP_NEXT_ELEMENT (next);
2592 /* Count the number of data-refs in the chain. */
2593 count++;
2596 if (groupsize == 0)
2597 groupsize = count + gaps;
2599 /* This could be UINT_MAX but as we are generating code in a very
2600 inefficient way we have to cap earlier. See PR78699 for example. */
2601 if (groupsize > 4096)
2603 if (dump_enabled_p ())
2604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2605 "group is too large\n");
2606 return false;
2609 /* Check that the size of the interleaving is equal to count for stores,
2610 i.e., that there are no gaps. */
2611 if (groupsize != count
2612 && !DR_IS_READ (dr))
2614 if (dump_enabled_p ())
2615 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2616 "interleaved store with gaps\n");
2617 return false;
2620 /* If there is a gap after the last load in the group it is the
2621 difference between the groupsize and the last accessed
2622 element.
2623 When there is no gap, this difference should be 0. */
2624 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
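/* Worked example (hypothetical, not from the original source): a group of
   4-byte loads at byte offsets 0, 4 and 12 with step 16 gives groupsize 4,
   DR_GROUP_GAP of 1 for the second member, 2 for the third (one element
   missing), last_accessed_element == 4 and therefore a trailing gap of
   4 - 4 == 0 stored on the first member.  */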
2626 DR_GROUP_SIZE (stmt_info) = groupsize;
2627 if (dump_enabled_p ())
2629 dump_printf_loc (MSG_NOTE, vect_location,
2630 "Detected interleaving ");
2631 if (DR_IS_READ (dr))
2632 dump_printf (MSG_NOTE, "load ");
2633 else
2634 dump_printf (MSG_NOTE, "store ");
2635 dump_printf (MSG_NOTE, "of size %u starting with ",
2636 (unsigned)groupsize);
2637 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
2638 if (DR_GROUP_GAP (stmt_info) != 0)
2639 dump_printf_loc (MSG_NOTE, vect_location,
2640 "There is a gap of %u elements after the group\n",
2641 DR_GROUP_GAP (stmt_info));
2644 /* SLP: create an SLP data structure for every interleaving group of
2645 stores for further analysis in vect_analyze_slp. */
2646 if (DR_IS_WRITE (dr) && !slp_impossible)
2648 if (loop_vinfo)
2649 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2650 if (bb_vinfo)
2651 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2655 return true;
2658 /* Analyze groups of accesses: check that DR belongs to a group of
2659 accesses of legal size, step, etc. Detect gaps, single element
2660 interleaving, and other special cases. Set grouped access info.
2661 Collect groups of strided stores for further use in SLP analysis. */
2663 static bool
2664 vect_analyze_group_access (struct data_reference *dr)
2666 if (!vect_analyze_group_access_1 (dr))
2668 /* Dissolve the group if present. */
2669 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (vect_dr_stmt (dr));
2670 while (stmt_info)
2672 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2673 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2674 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2675 stmt_info = next;
2677 return false;
2679 return true;
2682 /* Analyze the access pattern of the data-reference DR.
2683 In case of non-consecutive accesses call vect_analyze_group_access() to
2684 analyze groups of accesses. */
2686 static bool
2687 vect_analyze_data_ref_access (struct data_reference *dr)
2689 tree step = DR_STEP (dr);
2690 tree scalar_type = TREE_TYPE (DR_REF (dr));
2691 stmt_vec_info stmt_info = vect_dr_stmt (dr);
2692 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2693 struct loop *loop = NULL;
2695 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2696 return true;
2698 if (loop_vinfo)
2699 loop = LOOP_VINFO_LOOP (loop_vinfo);
2701 if (loop_vinfo && !step)
2703 if (dump_enabled_p ())
2704 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2705 "bad data-ref access in loop\n");
2706 return false;
2709 /* Allow loads with zero step in inner-loop vectorization. */
2710 if (loop_vinfo && integer_zerop (step))
2712 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2713 if (!nested_in_vect_loop_p (loop, stmt_info))
2714 return DR_IS_READ (dr);
2715 /* Allow references with zero step for outer loops marked
2716 with pragma omp simd only - it guarantees absence of
2717 loop-carried dependencies between inner loop iterations. */
2718 if (loop->safelen < 2)
2720 if (dump_enabled_p ())
2721 dump_printf_loc (MSG_NOTE, vect_location,
2722 "zero step in inner loop of nest\n");
2723 return false;
2727 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2729 /* Interleaved accesses are not yet supported within outer-loop
2730 vectorization for references in the inner-loop. */
2731 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2733 /* For the rest of the analysis we use the outer-loop step. */
2734 step = STMT_VINFO_DR_STEP (stmt_info);
2735 if (integer_zerop (step))
2737 if (dump_enabled_p ())
2738 dump_printf_loc (MSG_NOTE, vect_location,
2739 "zero step in outer loop.\n");
2740 return DR_IS_READ (dr);
2744 /* Consecutive? */
2745 if (TREE_CODE (step) == INTEGER_CST)
2747 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2748 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2749 || (dr_step < 0
2750 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2752 /* Mark that it is not interleaving. */
2753 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2754 return true;
2758 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2760 if (dump_enabled_p ())
2761 dump_printf_loc (MSG_NOTE, vect_location,
2762 "grouped access in outer loop.\n");
2763 return false;
2767 /* Assume this is a DR handled by non-constant strided load case. */
2768 if (TREE_CODE (step) != INTEGER_CST)
2769 return (STMT_VINFO_STRIDED_P (stmt_info)
2770 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2771 || vect_analyze_group_access (dr)));
2773 /* Not a consecutive access - check whether it's part of an interleaving group. */
2774 return vect_analyze_group_access (dr);
2777 /* Compare two data-references DRA and DRB so that they can be sorted into
2778 chunks suitable for grouping. */
2780 static int
2781 dr_group_sort_cmp (const void *dra_, const void *drb_)
2783 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2784 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2785 int cmp;
2787 /* Stabilize sort. */
2788 if (dra == drb)
2789 return 0;
2791 /* DRs in different loops never belong to the same group. */
2792 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2793 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2794 if (loopa != loopb)
2795 return loopa->num < loopb->num ? -1 : 1;
2797 /* Ordering of DRs according to base. */
2798 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2799 DR_BASE_ADDRESS (drb));
2800 if (cmp != 0)
2801 return cmp;
2803 /* And according to DR_OFFSET. */
2804 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2805 if (cmp != 0)
2806 return cmp;
2808 /* Put reads before writes. */
2809 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2810 return DR_IS_READ (dra) ? -1 : 1;
2812 /* Then sort after access size. */
2813 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2814 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2815 if (cmp != 0)
2816 return cmp;
2818 /* And after step. */
2819 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2820 if (cmp != 0)
2821 return cmp;
2823 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2824 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2825 if (cmp == 0)
2826 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2827 return cmp;
2830 /* If OP is the result of a conversion, return the unconverted value,
2831 otherwise return null. */
2833 static tree
2834 strip_conversion (tree op)
2836 if (TREE_CODE (op) != SSA_NAME)
2837 return NULL_TREE;
2838 gimple *stmt = SSA_NAME_DEF_STMT (op);
2839 if (!is_gimple_assign (stmt)
2840 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2841 return NULL_TREE;
2842 return gimple_assign_rhs1 (stmt);
2845 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2846 and STMT2_INFO being in a single group. */
2848 static bool
2849 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
2851 if (gimple_assign_single_p (stmt1_info->stmt))
2852 return gimple_assign_single_p (stmt2_info->stmt);
2854 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2855 if (call1 && gimple_call_internal_p (call1))
2857 /* Check for two masked loads or two masked stores. */
2858 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2859 if (!call2 || !gimple_call_internal_p (call2))
2860 return false;
2861 internal_fn ifn = gimple_call_internal_fn (call1);
2862 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2863 return false;
2864 if (ifn != gimple_call_internal_fn (call2))
2865 return false;
2867 /* Check that the masks are the same. Cope with casts of masks,
2868 like those created by build_mask_conversion. */
2869 tree mask1 = gimple_call_arg (call1, 2);
2870 tree mask2 = gimple_call_arg (call2, 2);
2871 if (!operand_equal_p (mask1, mask2, 0))
2873 mask1 = strip_conversion (mask1);
2874 if (!mask1)
2875 return false;
2876 mask2 = strip_conversion (mask2);
2877 if (!mask2)
2878 return false;
2879 if (!operand_equal_p (mask1, mask2, 0))
2880 return false;
2882 return true;
2885 return false;
2888 /* Function vect_analyze_data_ref_accesses.
2890 Analyze the access pattern of all the data references in the loop.
2892 FORNOW: the only access pattern that is considered vectorizable is a
2893 simple step 1 (consecutive) access.
2895 FORNOW: handle only arrays and pointer accesses. */
2897 bool
2898 vect_analyze_data_ref_accesses (vec_info *vinfo)
2900 unsigned int i;
2901 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2902 struct data_reference *dr;
2904 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2906 if (datarefs.is_empty ())
2907 return true;
2909 /* Sort the array of datarefs to make building the interleaving chains
2910 linear. Don't modify the original vector's order; it is needed for
2911 determining what dependencies are reversed. */
2912 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2913 datarefs_copy.qsort (dr_group_sort_cmp);
2915 /* Build the interleaving chains. */
2916 for (i = 0; i < datarefs_copy.length () - 1;)
2918 data_reference_p dra = datarefs_copy[i];
2919 stmt_vec_info stmtinfo_a = vect_dr_stmt (dra);
2920 stmt_vec_info lastinfo = NULL;
2921 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2922 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2924 ++i;
2925 continue;
2927 for (i = i + 1; i < datarefs_copy.length (); ++i)
2929 data_reference_p drb = datarefs_copy[i];
2930 stmt_vec_info stmtinfo_b = vect_dr_stmt (drb);
2931 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2932 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2933 break;
2935 /* ??? Imperfect sorting (non-compatible types, non-modulo
2936 accesses, same accesses) can lead to a group being artificially
2937 split here as we don't just skip over those. If it really
2938 matters we can push those to a worklist and re-iterate
2939 over them. Then we can just skip ahead to the next DR here. */
2941 /* DRs in a different loop should not be put into the same
2942 interleaving group. */
2943 if (gimple_bb (DR_STMT (dra))->loop_father
2944 != gimple_bb (DR_STMT (drb))->loop_father)
2945 break;
2947 /* Check that the data-refs have the same first location (except init)
2948 and that they are both either stores or loads (not a load and a store,
2949 and not mixing masked and unmasked accesses). */
2950 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2951 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2952 DR_BASE_ADDRESS (drb)) != 0
2953 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2954 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b))
2955 break;
2957 /* Check that the data-refs have the same constant size. */
2958 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2959 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2960 if (!tree_fits_uhwi_p (sza)
2961 || !tree_fits_uhwi_p (szb)
2962 || !tree_int_cst_equal (sza, szb))
2963 break;
2965 /* Check that the data-refs have the same step. */
2966 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2967 break;
2969 /* Check the types are compatible.
2970 ??? We don't distinguish this during sorting. */
2971 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2972 TREE_TYPE (DR_REF (drb))))
2973 break;
2975 /* Check that the DR_INITs are compile-time constants. */
2976 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2977 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2978 break;
2980 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2981 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2982 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2983 HOST_WIDE_INT init_prev
2984 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2985 gcc_assert (init_a <= init_b
2986 && init_a <= init_prev
2987 && init_prev <= init_b);
2989 /* Do not place the same access in the interleaving chain twice. */
2990 if (init_b == init_prev)
2992 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2993 < gimple_uid (DR_STMT (drb)));
2994 /* ??? For now we simply "drop" the later reference which is
2995 otherwise the same rather than finishing off this group.
2996 In the end we'd want to re-process duplicates forming
2997 multiple groups from the refs, likely by just collecting
2998 all candidates (including duplicates and split points
2999 below) in a vector and then process them together. */
3000 continue;
3003 /* If init_b == init_a + the size of the type * k, we have an
3004 interleaving, and DRA is accessed before DRB. */
3005 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3006 if (type_size_a == 0
3007 || (init_b - init_a) % type_size_a != 0)
3008 break;
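/* For example (illustrative only): with 4-byte elements and init_a == 0,
   init_b == 8 is 2 elements further on and may join the group, whereas
   init_b == 6 is not a multiple of the element size and the chain is split
   here.  */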
3010 /* If we have a store, the accesses are adjacent. This splits
3011 groups into chunks we support (we don't support vectorization
3012 of stores with gaps). */
3013 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3014 break;
3016 /* Requiring that the step (when constant and nonzero) is greater than
3017 the difference between the data-refs' inits splits groups into
3018 suitable sizes. */
3019 if (tree_fits_shwi_p (DR_STEP (dra)))
3021 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3022 if (step != 0 && step <= (init_b - init_a))
3023 break;
3026 if (dump_enabled_p ())
3028 dump_printf_loc (MSG_NOTE, vect_location,
3029 "Detected interleaving ");
3030 if (DR_IS_READ (dra))
3031 dump_printf (MSG_NOTE, "load ");
3032 else
3033 dump_printf (MSG_NOTE, "store ");
3034 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3035 dump_printf (MSG_NOTE, " and ");
3036 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3037 dump_printf (MSG_NOTE, "\n");
3040 /* Link the found element into the group list. */
3041 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3043 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3044 lastinfo = stmtinfo_a;
3046 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3047 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3048 lastinfo = stmtinfo_b;
3052 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3053 if (STMT_VINFO_VECTORIZABLE (vect_dr_stmt (dr))
3054 && !vect_analyze_data_ref_access (dr))
3056 if (dump_enabled_p ())
3057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3058 "not vectorized: complicated access pattern.\n");
3060 if (is_a <bb_vec_info> (vinfo))
3062 /* Mark the statement as not vectorizable. */
3063 STMT_VINFO_VECTORIZABLE (vect_dr_stmt (dr)) = false;
3064 continue;
3066 else
3068 datarefs_copy.release ();
3069 return false;
3073 datarefs_copy.release ();
3074 return true;
3077 /* Function vect_vfa_segment_size.
3079 Input:
3080 DR: The data reference.
3081 LENGTH_FACTOR: segment length to consider.
3083 Return a value suitable for the dr_with_seg_len::seg_len field.
3084 This is the "distance travelled" by the pointer from the first
3085 iteration in the segment to the last. Note that it does not include
3086 the size of the access; in effect it only describes the first byte. */
3088 static tree
3089 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3091 length_factor = size_binop (MINUS_EXPR,
3092 fold_convert (sizetype, length_factor),
3093 size_one_node);
3094 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3095 length_factor);
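/* For instance (illustrative only): with DR_STEP of 4 bytes and a length
   factor equal to a vectorization factor of 8, the segment length is
   (8 - 1) * 4 == 28 bytes; the size of the access itself is accounted for
   separately by vect_vfa_access_size.  */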
3098 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3099 gives the worst-case number of bytes covered by the segment. */
3101 static unsigned HOST_WIDE_INT
3102 vect_vfa_access_size (data_reference *dr)
3104 stmt_vec_info stmt_vinfo = vect_dr_stmt (dr);
3105 tree ref_type = TREE_TYPE (DR_REF (dr));
3106 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3107 unsigned HOST_WIDE_INT access_size = ref_size;
3108 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3110 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == vect_dr_stmt (dr));
3111 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3113 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3114 && (vect_supportable_dr_alignment (dr, false)
3115 == dr_explicit_realign_optimized))
3117 /* We might access a full vector's worth. */
3118 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3119 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3121 return access_size;
3124 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3126 static unsigned int
3127 vect_vfa_align (const data_reference *dr)
3129 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3132 /* Function vect_compile_time_alias.
3134 Given data references A and B with equal base and offset, see whether
3135 the alias relation can be decided at compilation time. Return 1 if
3136 it can and the references alias, 0 if it can and the references do
3137 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3138 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3139 of dr_with_seg_len::{seg_len,access_size} for A and B. */
3141 static int
3142 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3143 tree segment_length_a, tree segment_length_b,
3144 unsigned HOST_WIDE_INT access_size_a,
3145 unsigned HOST_WIDE_INT access_size_b)
3147 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3148 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3149 poly_uint64 const_length_a;
3150 poly_uint64 const_length_b;
3152 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3153 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3154 [a, a+12) */
3155 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3157 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3158 offset_a = (offset_a + access_size_a) - const_length_a;
3160 else
3161 const_length_a = tree_to_poly_uint64 (segment_length_a);
3162 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3164 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3165 offset_b = (offset_b + access_size_b) - const_length_b;
3167 else
3168 const_length_b = tree_to_poly_uint64 (segment_length_b);
3170 const_length_a += access_size_a;
3171 const_length_b += access_size_b;
3173 if (ranges_known_overlap_p (offset_a, const_length_a,
3174 offset_b, const_length_b))
3175 return 1;
3177 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3178 offset_b, const_length_b))
3179 return 0;
3181 return -1;
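/* Hypothetical illustration (not from the original source): two data-refs
   with positive steps, segment lengths of 28 bytes and access sizes of 4
   bytes cover 32 bytes each.  With DR_INITs of 0 and 16 the ranges
   [0, 32) and [16, 48) are known to overlap, so 1 is returned; with
   DR_INITs of 0 and 64 the ranges cannot overlap and 0 is returned.  */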
3184 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3185 in DDR is >= VF. */
3187 static bool
3188 dependence_distance_ge_vf (data_dependence_relation *ddr,
3189 unsigned int loop_depth, poly_uint64 vf)
3191 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3192 || DDR_NUM_DIST_VECTS (ddr) == 0)
3193 return false;
3195 /* If the dependence is exact, we should have limited the VF instead. */
3196 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3198 unsigned int i;
3199 lambda_vector dist_v;
3200 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3202 HOST_WIDE_INT dist = dist_v[loop_depth];
3203 if (dist != 0
3204 && !(dist > 0 && DDR_REVERSED_P (ddr))
3205 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3206 return false;
3209 if (dump_enabled_p ())
3211 dump_printf_loc (MSG_NOTE, vect_location,
3212 "dependence distance between ");
3213 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3214 dump_printf (MSG_NOTE, " and ");
3215 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3216 dump_printf (MSG_NOTE, " is >= VF\n");
3219 return true;
3222 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3224 static void
3225 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3227 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3228 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3229 dump_printf (dump_kind, ") >= ");
3230 dump_dec (dump_kind, lower_bound.min_value);
3233 /* Record that the vectorized loop requires the vec_lower_bound described
3234 by EXPR, UNSIGNED_P and MIN_VALUE. */
3236 static void
3237 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3238 poly_uint64 min_value)
3240 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3241 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3242 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3244 unsigned_p &= lower_bounds[i].unsigned_p;
3245 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3246 if (lower_bounds[i].unsigned_p != unsigned_p
3247 || maybe_lt (lower_bounds[i].min_value, min_value))
3249 lower_bounds[i].unsigned_p = unsigned_p;
3250 lower_bounds[i].min_value = min_value;
3251 if (dump_enabled_p ())
3253 dump_printf_loc (MSG_NOTE, vect_location,
3254 "updating run-time check to ");
3255 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3256 dump_printf (MSG_NOTE, "\n");
3259 return;
3262 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3263 if (dump_enabled_p ())
3265 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3266 dump_lower_bound (MSG_NOTE, lower_bound);
3267 dump_printf (MSG_NOTE, "\n");
3269 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3272 /* Return true if it's unlikely that the step of the vectorized form of DR
3273 will span fewer than GAP bytes. */
3275 static bool
3276 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3278 stmt_vec_info stmt_info = vect_dr_stmt (dr);
3279 HOST_WIDE_INT count
3280 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3281 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3282 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3283 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3286 /* Return true if we know that there is no alias between DR_A and DR_B
3287 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3288 *LOWER_BOUND_OUT to this N. */
3290 static bool
3291 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3292 poly_uint64 *lower_bound_out)
3294 /* Check that there is a constant gap of known sign between DR_A
3295 and DR_B. */
3296 poly_int64 init_a, init_b;
3297 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3298 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3299 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3300 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3301 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3302 || !ordered_p (init_a, init_b))
3303 return false;
3305 /* Sort DR_A and DR_B by the address they access. */
3306 if (maybe_lt (init_b, init_a))
3308 std::swap (init_a, init_b);
3309 std::swap (dr_a, dr_b);
3312 /* If the two accesses could be dependent within a scalar iteration,
3313 make sure that we'd retain their order. */
3314 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3315 && !vect_preserves_scalar_order_p (vect_dr_stmt (dr_a),
3316 vect_dr_stmt (dr_b)))
3317 return false;
3319 /* There is no alias if abs (DR_STEP) is greater than or equal to
3320 the bytes spanned by the combination of the two accesses. */
3321 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3322 return true;
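/* Example (illustrative only): for two 4-byte accesses with equal base,
   offset and step and DR_INITs of 0 and 12, *LOWER_BOUND_OUT becomes
   12 + 4 - 0 == 16, i.e. the accesses cannot alias whenever
   abs (DR_STEP) >= 16, which the caller can turn into a cheap runtime
   check on the step.  */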
3325 /* Function vect_prune_runtime_alias_test_list.
3327 Prune a list of ddrs to be tested at run-time by versioning for alias.
3328 Merge several alias checks into one if possible.
3329 Return FALSE if the resulting list of ddrs is longer than allowed by
3330 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3332 bool
3333 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3335 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3336 hash_set <tree_pair_hash> compared_objects;
3338 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3339 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3340 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3341 vec<vec_object_pair> &check_unequal_addrs
3342 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3343 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3344 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3346 ddr_p ddr;
3347 unsigned int i;
3348 tree length_factor;
3350 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3352 /* Step values are irrelevant for aliasing if the number of vector
3353 iterations is equal to the number of scalar iterations (which can
3354 happen for fully-SLP loops). */
3355 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3357 if (!ignore_step_p)
3359 /* Convert the checks for nonzero steps into bound tests. */
3360 tree value;
3361 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3362 vect_check_lower_bound (loop_vinfo, value, true, 1);
3365 if (may_alias_ddrs.is_empty ())
3366 return true;
3368 comp_alias_ddrs.create (may_alias_ddrs.length ());
3370 unsigned int loop_depth
3371 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3372 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3374 /* First, we collect all data ref pairs for aliasing checks. */
3375 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3377 int comp_res;
3378 poly_uint64 lower_bound;
3379 struct data_reference *dr_a, *dr_b;
3380 tree segment_length_a, segment_length_b;
3381 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3382 unsigned int align_a, align_b;
3384 /* Ignore the alias if the VF we chose ended up being no greater
3385 than the dependence distance. */
3386 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3387 continue;
3389 if (DDR_OBJECT_A (ddr))
3391 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3392 if (!compared_objects.add (new_pair))
3394 if (dump_enabled_p ())
3396 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3397 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3398 dump_printf (MSG_NOTE, " and ");
3399 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3400 dump_printf (MSG_NOTE, " have different addresses\n");
3402 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3404 continue;
3407 dr_a = DDR_A (ddr);
3408 stmt_vec_info stmt_info_a = vect_dr_stmt (DDR_A (ddr));
3410 dr_b = DDR_B (ddr);
3411 stmt_vec_info stmt_info_b = vect_dr_stmt (DDR_B (ddr));
3413 /* Skip the pair if inter-iteration dependencies are irrelevant
3414 and intra-iteration dependencies are guaranteed to be honored. */
3415 if (ignore_step_p
3416 && (vect_preserves_scalar_order_p (stmt_info_a, stmt_info_b)
3417 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3419 if (dump_enabled_p ())
3421 dump_printf_loc (MSG_NOTE, vect_location,
3422 "no need for alias check between ");
3423 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3424 dump_printf (MSG_NOTE, " and ");
3425 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3426 dump_printf (MSG_NOTE, " when VF is 1\n");
3428 continue;
3431 /* See whether we can handle the alias using a bounds check on
3432 the step, and whether that's likely to be the best approach.
3433 (It might not be, for example, if the minimum step is much larger
3434 than the number of bytes handled by one vector iteration.) */
3435 if (!ignore_step_p
3436 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3437 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3438 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3439 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3441 bool unsigned_p = dr_known_forward_stride_p (dr_a);
3442 if (dump_enabled_p ())
3444 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3445 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3446 dump_printf (MSG_NOTE, " and ");
3447 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3448 dump_printf (MSG_NOTE, " when the step ");
3449 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3450 dump_printf (MSG_NOTE, " is outside ");
3451 if (unsigned_p)
3452 dump_printf (MSG_NOTE, "[0");
3453 else
3455 dump_printf (MSG_NOTE, "(");
3456 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3458 dump_printf (MSG_NOTE, ", ");
3459 dump_dec (MSG_NOTE, lower_bound);
3460 dump_printf (MSG_NOTE, ")\n");
3462 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3463 lower_bound);
3464 continue;
3467 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3468 if (dr_group_first_a)
3470 stmt_info_a = dr_group_first_a;
3471 dr_a = STMT_VINFO_DATA_REF (stmt_info_a);
3474 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3475 if (dr_group_first_b)
3477 stmt_info_b = dr_group_first_b;
3478 dr_b = STMT_VINFO_DATA_REF (stmt_info_b);
3481 if (ignore_step_p)
3483 segment_length_a = size_zero_node;
3484 segment_length_b = size_zero_node;
3486 else
3488 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3489 length_factor = scalar_loop_iters;
3490 else
3491 length_factor = size_int (vect_factor);
3492 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3493 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3495 access_size_a = vect_vfa_access_size (dr_a);
3496 access_size_b = vect_vfa_access_size (dr_b);
3497 align_a = vect_vfa_align (dr_a);
3498 align_b = vect_vfa_align (dr_b);
3500 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3501 DR_BASE_ADDRESS (dr_b));
3502 if (comp_res == 0)
3503 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3504 DR_OFFSET (dr_b));
3506 /* See whether the alias is known at compilation time. */
3507 if (comp_res == 0
3508 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3509 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3510 && poly_int_tree_p (segment_length_a)
3511 && poly_int_tree_p (segment_length_b))
3513 int res = vect_compile_time_alias (dr_a, dr_b,
3514 segment_length_a,
3515 segment_length_b,
3516 access_size_a,
3517 access_size_b);
3518 if (res >= 0 && dump_enabled_p ())
3520 dump_printf_loc (MSG_NOTE, vect_location,
3521 "can tell at compile time that ");
3522 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3523 dump_printf (MSG_NOTE, " and ");
3524 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3525 if (res == 0)
3526 dump_printf (MSG_NOTE, " do not alias\n");
3527 else
3528 dump_printf (MSG_NOTE, " alias\n");
3531 if (res == 0)
3532 continue;
3534 if (res == 1)
3536 if (dump_enabled_p ())
3537 dump_printf_loc (MSG_NOTE, vect_location,
3538 "not vectorized: compilation time alias.\n");
3539 return false;
3543 dr_with_seg_len_pair_t dr_with_seg_len_pair
3544 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3545 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3547 /* Canonicalize pairs by sorting the two DR members. */
3548 if (comp_res > 0)
3549 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3551 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3554 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3556 unsigned int count = (comp_alias_ddrs.length ()
3557 + check_unequal_addrs.length ());
3559 dump_printf_loc (MSG_NOTE, vect_location,
3560 "improved number of alias checks from %d to %d\n",
3561 may_alias_ddrs.length (), count);
3562 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3564 if (dump_enabled_p ())
3565 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3566 "number of versioning for alias "
3567 "run-time tests exceeds %d "
3568 "(--param vect-max-version-for-alias-checks)\n",
3569 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3570 return false;
3573 return true;
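/* Illustrative sketch only (not code from this file): each surviving
   dr_with_seg_len pair (A, B) collected above is later expanded into a
   runtime test of roughly the form

     no_alias = addr_A + segment_length_A <= addr_B
                || addr_B + segment_length_B <= addr_A;

   and the loop is versioned on the conjunction of all such tests, with
   the recorded access sizes and alignments used to tighten the
   comparison.  */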
3576 /* Check whether we can use an internal function for a gather load
3577 or scatter store. READ_P is true for loads and false for stores.
3578 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3579 the type of the memory elements being loaded or stored. OFFSET_BITS
3580 is the number of bits in each scalar offset and OFFSET_SIGN is the
3581 sign of the offset. SCALE is the amount by which the offset should
3582 be multiplied *after* it has been converted to address width.
3584 Return true if the function is supported, storing the function
3585 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
3587 bool
3588 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3589 tree memory_type, unsigned int offset_bits,
3590 signop offset_sign, int scale,
3591 internal_fn *ifn_out, tree *element_type_out)
3593 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3594 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3595 if (offset_bits > element_bits)
3596 /* Internal functions require the offset to be the same width as
3597 the vector elements. We can extend narrower offsets, but it isn't
3598 safe to truncate wider offsets. */
3599 return false;
3601 if (element_bits != memory_bits)
3602 /* For now the vector elements must be the same width as the
3603 memory elements. */
3604 return false;
3606 /* Work out which function we need. */
3607 internal_fn ifn;
3608 if (read_p)
3609 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3610 else
3611 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3613 /* Test whether the target supports this combination. */
3614 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3615 offset_sign, scale))
3616 return false;
3618 *ifn_out = ifn;
3619 *element_type_out = TREE_TYPE (vectype);
3620 return true;
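/* For illustration (an assumed example, names not taken from this file):
   a gather such as

     for (i = 0; i < n; i++)
       res[i] = data[idx[i]];

   with double DATA elements and 64-bit IDX elements would query
   IFN_GATHER_LOAD with MEMORY_TYPE double, OFFSET_BITS 64 and SCALE 8,
   i.e. each lane loads *(base + offset[i] * 8) after the offset has
   been extended to address width.  */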
3623 /* STMT_INFO is a call to an internal gather load or scatter store function.
3624 Describe the operation in INFO. */
3626 static void
3627 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3628 gather_scatter_info *info)
3630 gcall *call = as_a <gcall *> (stmt_info->stmt);
3631 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3632 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3634 info->ifn = gimple_call_internal_fn (call);
3635 info->decl = NULL_TREE;
3636 info->base = gimple_call_arg (call, 0);
3637 info->offset = gimple_call_arg (call, 1);
3638 info->offset_dt = vect_unknown_def_type;
3639 info->offset_vectype = NULL_TREE;
3640 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3641 info->element_type = TREE_TYPE (vectype);
3642 info->memory_type = TREE_TYPE (DR_REF (dr));
3645 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3646 gather load or scatter store. Describe the operation in *INFO if so. */
3648 bool
3649 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3650 gather_scatter_info *info)
3652 HOST_WIDE_INT scale = 1;
3653 poly_int64 pbitpos, pbitsize;
3654 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3655 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3656 tree offtype = NULL_TREE;
3657 tree decl = NULL_TREE, base, off;
3658 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3659 tree memory_type = TREE_TYPE (DR_REF (dr));
3660 machine_mode pmode;
3661 int punsignedp, reversep, pvolatilep = 0;
3662 internal_fn ifn;
3663 tree element_type;
3664 bool masked_p = false;
3666 /* See whether this is already a call to a gather/scatter internal function.
3667 If not, see whether it's a masked load or store. */
3668 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3669 if (call && gimple_call_internal_p (call))
3671 ifn = gimple_call_internal_fn (call);
3672 if (internal_gather_scatter_fn_p (ifn))
3674 vect_describe_gather_scatter_call (stmt_info, info);
3675 return true;
3677 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3680 /* True if we should aim to use internal functions rather than
3681 built-in functions. */
3682 bool use_ifn_p = (DR_IS_READ (dr)
3683 ? supports_vec_gather_load_p ()
3684 : supports_vec_scatter_store_p ());
3686 base = DR_REF (dr);
3687 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3688 see if we can use the def stmt of the address. */
3689 if (masked_p
3690 && TREE_CODE (base) == MEM_REF
3691 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3692 && integer_zerop (TREE_OPERAND (base, 1))
3693 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3695 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3696 if (is_gimple_assign (def_stmt)
3697 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3698 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3701 /* The gather and scatter builtins need address of the form
3702 loop_invariant + vector * {1, 2, 4, 8}
3703 or
3704 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3705 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3706 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3707 multiplications and additions in it. To get a vector, we need
3708 a single SSA_NAME that will be defined in the loop and will
3709 contain everything that is not loop invariant and that can be
3710 vectorized. The following code attempts to find such a preexisting
3711 SSA_NAME OFF and put the loop invariants into a tree BASE
3712 that can be gimplified before the loop. */
3713 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3714 &punsignedp, &reversep, &pvolatilep);
3715 if (reversep)
3716 return false;
3718 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3720 if (TREE_CODE (base) == MEM_REF)
3722 if (!integer_zerop (TREE_OPERAND (base, 1)))
3724 if (off == NULL_TREE)
3725 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3726 else
3727 off = size_binop (PLUS_EXPR, off,
3728 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3730 base = TREE_OPERAND (base, 0);
3732 else
3733 base = build_fold_addr_expr (base);
3735 if (off == NULL_TREE)
3736 off = size_zero_node;
3738 /* If base is not loop invariant, then if off is 0 we start with just
3739 the constant offset in the loop invariant BASE and continue with base
3740 as OFF; otherwise give up.
3741 We could handle that case by gimplifying the addition of base + off
3742 into some SSA_NAME and using that as off, but for now punt. */
3743 if (!expr_invariant_in_loop_p (loop, base))
3745 if (!integer_zerop (off))
3746 return false;
3747 off = base;
3748 base = size_int (pbytepos);
3750 /* Otherwise put base + constant offset into the loop invariant BASE
3751 and continue with OFF. */
3752 else
3754 base = fold_convert (sizetype, base);
3755 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3758 /* OFF at this point may be either a SSA_NAME or some tree expression
3759 from get_inner_reference. Try to peel off loop invariants from it
3760 into BASE as long as possible. */
3761 STRIP_NOPS (off);
3762 while (offtype == NULL_TREE)
3764 enum tree_code code;
3765 tree op0, op1, add = NULL_TREE;
3767 if (TREE_CODE (off) == SSA_NAME)
3769 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3771 if (expr_invariant_in_loop_p (loop, off))
3772 return false;
3774 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3775 break;
3777 op0 = gimple_assign_rhs1 (def_stmt);
3778 code = gimple_assign_rhs_code (def_stmt);
3779 op1 = gimple_assign_rhs2 (def_stmt);
3781 else
3783 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3784 return false;
3785 code = TREE_CODE (off);
3786 extract_ops_from_tree (off, &code, &op0, &op1);
3788 switch (code)
3790 case POINTER_PLUS_EXPR:
3791 case PLUS_EXPR:
3792 if (expr_invariant_in_loop_p (loop, op0))
3794 add = op0;
3795 off = op1;
3796 do_add:
3797 add = fold_convert (sizetype, add);
3798 if (scale != 1)
3799 add = size_binop (MULT_EXPR, add, size_int (scale));
3800 base = size_binop (PLUS_EXPR, base, add);
3801 continue;
3803 if (expr_invariant_in_loop_p (loop, op1))
3805 add = op1;
3806 off = op0;
3807 goto do_add;
3809 break;
3810 case MINUS_EXPR:
3811 if (expr_invariant_in_loop_p (loop, op1))
3813 add = fold_convert (sizetype, op1);
3814 add = size_binop (MINUS_EXPR, size_zero_node, add);
3815 off = op0;
3816 goto do_add;
3818 break;
3819 case MULT_EXPR:
3820 if (scale == 1 && tree_fits_shwi_p (op1))
3822 int new_scale = tree_to_shwi (op1);
3823 /* Only treat this as a scaling operation if the target
3824 supports it. */
3825 if (use_ifn_p
3826 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3827 vectype, memory_type, 1,
3828 TYPE_SIGN (TREE_TYPE (op0)),
3829 new_scale, &ifn,
3830 &element_type))
3831 break;
3832 scale = new_scale;
3833 off = op0;
3834 continue;
3836 break;
3837 case SSA_NAME:
3838 off = op0;
3839 continue;
3840 CASE_CONVERT:
3841 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3842 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3843 break;
3844 if (TYPE_PRECISION (TREE_TYPE (op0))
3845 == TYPE_PRECISION (TREE_TYPE (off)))
3847 off = op0;
3848 continue;
3851 /* The internal functions need the offset to be the same width
3852 as the elements of VECTYPE. Don't include operations that
3853 cast the offset from that width to a different width. */
3854 if (use_ifn_p
3855 && (int_size_in_bytes (TREE_TYPE (vectype))
3856 == int_size_in_bytes (TREE_TYPE (off))))
3857 break;
3859 if (TYPE_PRECISION (TREE_TYPE (op0))
3860 < TYPE_PRECISION (TREE_TYPE (off)))
3862 off = op0;
3863 offtype = TREE_TYPE (off);
3864 STRIP_NOPS (off);
3865 continue;
3867 break;
3868 default:
3869 break;
3871 break;
3874 /* If at the end OFF still isn't a SSA_NAME or isn't
3875 defined in the loop, punt. */
3876 if (TREE_CODE (off) != SSA_NAME
3877 || expr_invariant_in_loop_p (loop, off))
3878 return false;
3880 if (offtype == NULL_TREE)
3881 offtype = TREE_TYPE (off);
3883 if (use_ifn_p)
3885 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3886 memory_type, TYPE_PRECISION (offtype),
3887 TYPE_SIGN (offtype), scale, &ifn,
3888 &element_type))
3889 return false;
3891 else
3893 if (DR_IS_READ (dr))
3895 if (targetm.vectorize.builtin_gather)
3896 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3898 else
3900 if (targetm.vectorize.builtin_scatter)
3901 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3904 if (!decl)
3905 return false;
3907 ifn = IFN_LAST;
3908 element_type = TREE_TYPE (vectype);
3911 info->ifn = ifn;
3912 info->decl = decl;
3913 info->base = base;
3914 info->offset = off;
3915 info->offset_dt = vect_unknown_def_type;
3916 info->offset_vectype = NULL_TREE;
3917 info->scale = scale;
3918 info->element_type = element_type;
3919 info->memory_type = memory_type;
3920 return true;
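/* Worked illustration with hypothetical names: for a read X[C + J]
   where X is a global array of doubles, C is loop invariant and J is
   an SSA_NAME defined in the loop, the peeling above typically leaves

     info->base   = (sizetype) &X + C * 8   (loop invariant)
     info->offset = J                       (varies in the loop)
     info->scale  = 8                       (element size from the MULT_EXPR)

   so that each lane accesses base + offset[i] * scale.  */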
3923 /* Find the data references in STMT, analyze them with respect to LOOP and
3924 append them to DATAREFS. Return false if datarefs in this stmt cannot
3925 be handled. */
3927 bool
3928 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3929 vec<data_reference_p> *datarefs)
3931 /* We can ignore clobbers for dataref analysis - they are removed during
3932 loop vectorization and BB vectorization checks dependences with a
3933 stmt walk. */
3934 if (gimple_clobber_p (stmt))
3935 return true;
3937 if (gimple_has_volatile_ops (stmt))
3939 if (dump_enabled_p ())
3941 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3942 "not vectorized: volatile type ");
3943 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3945 return false;
3948 if (stmt_can_throw_internal (stmt))
3950 if (dump_enabled_p ())
3952 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3953 "not vectorized: statement can throw an "
3954 "exception ");
3955 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3957 return false;
3960 auto_vec<data_reference_p, 2> refs;
3961 if (!find_data_references_in_stmt (loop, stmt, &refs))
3962 return false;
3964 if (refs.is_empty ())
3965 return true;
3967 if (refs.length () > 1)
3969 if (dump_enabled_p ())
3971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3972 "not vectorized: more than one data ref "
3973 "in stmt: ");
3974 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3976 return false;
3979 if (gcall *call = dyn_cast <gcall *> (stmt))
3980 if (!gimple_call_internal_p (call)
3981 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
3982 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
3984 if (dump_enabled_p ())
3986 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3987 "not vectorized: dr in a call ");
3988 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3990 return false;
3993 data_reference_p dr = refs.pop ();
3994 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3995 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3997 if (dump_enabled_p ())
3999 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4000 "not vectorized: statement is bitfield "
4001 "access ");
4002 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4004 return false;
4007 if (DR_BASE_ADDRESS (dr)
4008 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4010 if (dump_enabled_p ())
4011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4012 "not vectorized: base addr of dr is a "
4013 "constant\n");
4014 return false;
4017 /* Check whether this may be a SIMD lane access and adjust the
4018 DR to make it easier for us to handle it. */
4019 if (loop
4020 && loop->simduid
4021 && (!DR_BASE_ADDRESS (dr)
4022 || !DR_OFFSET (dr)
4023 || !DR_INIT (dr)
4024 || !DR_STEP (dr)))
4026 struct data_reference *newdr
4027 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4028 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4029 if (DR_BASE_ADDRESS (newdr)
4030 && DR_OFFSET (newdr)
4031 && DR_INIT (newdr)
4032 && DR_STEP (newdr)
4033 && integer_zerop (DR_STEP (newdr)))
4035 tree off = DR_OFFSET (newdr);
4036 STRIP_NOPS (off);
4037 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4038 && TREE_CODE (off) == MULT_EXPR
4039 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4041 tree step = TREE_OPERAND (off, 1);
4042 off = TREE_OPERAND (off, 0);
4043 STRIP_NOPS (off);
4044 if (CONVERT_EXPR_P (off)
4045 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4046 < TYPE_PRECISION (TREE_TYPE (off))))
4047 off = TREE_OPERAND (off, 0);
4048 if (TREE_CODE (off) == SSA_NAME)
4050 gimple *def = SSA_NAME_DEF_STMT (off);
4051 tree reft = TREE_TYPE (DR_REF (newdr));
4052 if (is_gimple_call (def)
4053 && gimple_call_internal_p (def)
4054 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4056 tree arg = gimple_call_arg (def, 0);
4057 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4058 arg = SSA_NAME_VAR (arg);
4059 if (arg == loop->simduid
4060 /* For now. */
4061 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4063 DR_OFFSET (newdr) = ssize_int (0);
4064 DR_STEP (newdr) = step;
4065 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4066 DR_STEP_ALIGNMENT (newdr)
4067 = highest_pow2_factor (step);
4068 /* Mark as simd-lane access. */
4069 newdr->aux = (void *)-1;
4070 free_data_ref (dr);
4071 datarefs->safe_push (newdr);
4072 return true;
4078 free_data_ref (newdr);
4081 datarefs->safe_push (dr);
4082 return true;
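/* Note on the SIMD lane access case above: an offset of the form
   (conversion of) .GOMP_SIMD_LANE (simduid) * TYPE_SIZE_UNIT (ref)
   typically comes from variables privatized in an OpenMP simd loop,
   which are lowered to per-lane array accesses indexed by the lane
   number.  The extra checks (constant DR_INIT, zero step, multiplier
   equal to the element size) restrict this to the simple case, as the
   "For now." comment indicates.  */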
4085 /* Function vect_analyze_data_refs.
4087 Find all the data references in the loop or basic block.
4089 The general structure of the analysis of data refs in the vectorizer is as
4090 follows:
4091 1- vect_analyze_data_refs(loop/bb): call
4092 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4093 in the loop/bb and their dependences.
4094 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4095 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4096 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4100 bool
4101 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4103 struct loop *loop = NULL;
4104 unsigned int i;
4105 struct data_reference *dr;
4106 tree scalar_type;
4108 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4110 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4111 loop = LOOP_VINFO_LOOP (loop_vinfo);
4113 /* Go through the data-refs, check that the analysis succeeded. Update
4114 pointer from stmt_vec_info struct to DR and vectype. */
4116 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4117 FOR_EACH_VEC_ELT (datarefs, i, dr)
4119 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4120 poly_uint64 vf;
4122 gcc_assert (DR_REF (dr));
4123 stmt_vec_info stmt_info = vect_dr_stmt (dr);
4125 /* Check that analysis of the data-ref succeeded. */
4126 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4127 || !DR_STEP (dr))
4129 bool maybe_gather
4130 = DR_IS_READ (dr)
4131 && !TREE_THIS_VOLATILE (DR_REF (dr))
4132 && (targetm.vectorize.builtin_gather != NULL
4133 || supports_vec_gather_load_p ());
4134 bool maybe_scatter
4135 = DR_IS_WRITE (dr)
4136 && !TREE_THIS_VOLATILE (DR_REF (dr))
4137 && (targetm.vectorize.builtin_scatter != NULL
4138 || supports_vec_scatter_store_p ());
4140 /* If the target supports vector gather loads or scatter stores,
4141 see whether they can be used. */
4142 if (is_a <loop_vec_info> (vinfo)
4143 && !nested_in_vect_loop_p (loop, stmt_info))
4145 if (maybe_gather || maybe_scatter)
4147 if (maybe_gather)
4148 gatherscatter = GATHER;
4149 else
4150 gatherscatter = SCATTER;
4154 if (gatherscatter == SG_NONE)
4156 if (dump_enabled_p ())
4158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4159 "not vectorized: data ref analysis "
4160 "failed ");
4161 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4162 stmt_info->stmt, 0);
4164 if (is_a <bb_vec_info> (vinfo))
4166 /* In BB vectorization the ref can still participate
4167 in dependence analysis; we just can't vectorize it. */
4168 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4169 continue;
4171 return false;
4175 /* See if this was detected as SIMD lane access. */
4176 if (dr->aux == (void *)-1)
4178 if (nested_in_vect_loop_p (loop, stmt_info))
4180 if (dump_enabled_p ())
4182 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4183 "not vectorized: data ref analysis "
4184 "failed ");
4185 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4186 stmt_info->stmt, 0);
4188 return false;
4190 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4193 tree base = get_base_address (DR_REF (dr));
4194 if (base && VAR_P (base) && DECL_NONALIASED (base))
4196 if (dump_enabled_p ())
4198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4199 "not vectorized: base object not addressable "
4200 "for stmt: ");
4201 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4202 stmt_info->stmt, 0);
4204 if (is_a <bb_vec_info> (vinfo))
4206 /* In BB vectorization the ref can still participate
4207 in dependence analysis; we just can't vectorize it. */
4208 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4209 continue;
4211 return false;
4214 if (is_a <loop_vec_info> (vinfo)
4215 && DR_STEP (dr)
4216 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4218 if (nested_in_vect_loop_p (loop, stmt_info))
4220 if (dump_enabled_p ())
4222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4223 "not vectorized: not suitable for strided "
4224 "load ");
4225 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4226 stmt_info->stmt, 0);
4228 return false;
4230 STMT_VINFO_STRIDED_P (stmt_info) = true;
4233 /* Update DR field in stmt_vec_info struct. */
4235 /* If the dataref is in an inner-loop of the loop that is considered
4236 for vectorization, we also want to analyze the access relative to
4237 the outer-loop (DR contains information only relative to the
4238 inner-most enclosing loop). We do that by building a reference to the
4239 first location accessed by the inner-loop, and analyze it relative to
4240 the outer-loop. */
4241 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4243 /* Build a reference to the first location accessed by the
4244 inner loop: *(BASE + INIT + OFFSET). By construction,
4245 this address must be invariant in the inner loop, so we
4246 can consider it as being used in the outer loop. */
4247 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4248 tree offset = unshare_expr (DR_OFFSET (dr));
4249 tree init = unshare_expr (DR_INIT (dr));
4250 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4251 init, offset);
4252 tree init_addr = fold_build_pointer_plus (base, init_offset);
4253 tree init_ref = build_fold_indirect_ref (init_addr);
4255 if (dump_enabled_p ())
4257 dump_printf_loc (MSG_NOTE, vect_location,
4258 "analyze in outer loop: ");
4259 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4260 dump_printf (MSG_NOTE, "\n");
4263 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4264 init_ref, loop))
4265 /* dr_analyze_innermost already explained the failure. */
4266 return false;
4268 if (dump_enabled_p ())
4270 dump_printf_loc (MSG_NOTE, vect_location,
4271 "\touter base_address: ");
4272 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4273 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4274 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4275 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4276 STMT_VINFO_DR_OFFSET (stmt_info));
4277 dump_printf (MSG_NOTE,
4278 "\n\touter constant offset from base address: ");
4279 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4280 STMT_VINFO_DR_INIT (stmt_info));
4281 dump_printf (MSG_NOTE, "\n\touter step: ");
4282 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4283 STMT_VINFO_DR_STEP (stmt_info));
4284 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4285 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4286 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4287 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4288 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4289 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4290 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4291 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4295 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
4296 STMT_VINFO_DATA_REF (stmt_info) = dr;
4298 /* Set vectype for STMT. */
4299 scalar_type = TREE_TYPE (DR_REF (dr));
4300 STMT_VINFO_VECTYPE (stmt_info)
4301 = get_vectype_for_scalar_type (scalar_type);
4302 if (!STMT_VINFO_VECTYPE (stmt_info))
4304 if (dump_enabled_p ())
4306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4307 "not vectorized: no vectype for stmt: ");
4308 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4309 stmt_info->stmt, 0);
4310 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4311 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4312 scalar_type);
4313 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4316 if (is_a <bb_vec_info> (vinfo))
4318 /* No vector type is fine, the ref can still participate
4319 in dependence analysis; we just can't vectorize it. */
4320 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4321 continue;
4323 return false;
4325 else
4327 if (dump_enabled_p ())
4329 dump_printf_loc (MSG_NOTE, vect_location,
4330 "got vectype for stmt: ");
4331 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
4332 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4333 STMT_VINFO_VECTYPE (stmt_info));
4334 dump_printf (MSG_NOTE, "\n");
4338 /* Adjust the minimal vectorization factor according to the
4339 vector type. */
4340 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4341 *min_vf = upper_bound (*min_vf, vf);
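/* Illustrative numbers: with 128-bit vectors a char access yields
   vf == 16 and an int access vf == 4, so *MIN_VF becomes 16 here;
   the chosen vectorization factor must be at least the largest
   number of subparts among the vector types involved.  */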
4343 if (gatherscatter != SG_NONE)
4345 gather_scatter_info gs_info;
4346 if (!vect_check_gather_scatter (stmt_info,
4347 as_a <loop_vec_info> (vinfo),
4348 &gs_info)
4349 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4351 if (dump_enabled_p ())
4353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4354 (gatherscatter == GATHER) ?
4355 "not vectorized: not suitable for gather "
4356 "load " :
4357 "not vectorized: not suitable for scatter "
4358 "store ");
4359 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4360 stmt_info->stmt, 0);
4362 return false;
4364 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4368 /* We used to stop processing and prune the list here. Verify we no
4369 longer need to. */
4370 gcc_assert (i == datarefs.length ());
4372 return true;
4376 /* Function vect_get_new_vect_var.
4378 Returns a name for a new variable. The current naming scheme uses a
4379 prefix chosen by VAR_KIND ("vect", "stmp", "mask" or "vectp") for
4380 vectorizer-generated variables, and prepends that prefix to NAME if
4381 provided. */
4383 tree
4384 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4386 const char *prefix;
4387 tree new_vect_var;
4389 switch (var_kind)
4391 case vect_simple_var:
4392 prefix = "vect";
4393 break;
4394 case vect_scalar_var:
4395 prefix = "stmp";
4396 break;
4397 case vect_mask_var:
4398 prefix = "mask";
4399 break;
4400 case vect_pointer_var:
4401 prefix = "vectp";
4402 break;
4403 default:
4404 gcc_unreachable ();
4407 if (name)
4409 char* tmp = concat (prefix, "_", name, NULL);
4410 new_vect_var = create_tmp_reg (type, tmp);
4411 free (tmp);
4413 else
4414 new_vect_var = create_tmp_reg (type, prefix);
4416 return new_vect_var;
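/* For example, calling this with VAR_KIND == vect_pointer_var and
   NAME == "foo" (an illustrative name) requests a temporary based on
   "vectp_foo"; with a NULL NAME just the prefix is used.  */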
4419 /* Like vect_get_new_vect_var but return an SSA name. */
4421 tree
4422 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4424 const char *prefix;
4425 tree new_vect_var;
4427 switch (var_kind)
4429 case vect_simple_var:
4430 prefix = "vect";
4431 break;
4432 case vect_scalar_var:
4433 prefix = "stmp";
4434 break;
4435 case vect_pointer_var:
4436 prefix = "vectp";
4437 break;
4438 default:
4439 gcc_unreachable ();
4442 if (name)
4444 char* tmp = concat (prefix, "_", name, NULL);
4445 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4446 free (tmp);
4448 else
4449 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4451 return new_vect_var;
4454 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4456 static void
4457 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4459 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4460 int misalign = DR_MISALIGNMENT (dr);
4461 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4462 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4463 else
4464 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4465 DR_TARGET_ALIGNMENT (dr), misalign);
4468 /* Function vect_create_addr_base_for_vector_ref.
4470 Create an expression that computes the address of the first memory location
4471 that will be accessed for a data reference.
4473 Input:
4474 STMT_INFO: The statement containing the data reference.
4475 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4476 OFFSET: Optional. If supplied, it is added to the initial address.
4477 LOOP: Specify relative to which loop nest the address should be computed.
4478 For example, when the dataref is in an inner-loop nested in an
4479 outer-loop that is now being vectorized, LOOP can be either the
4480 outer-loop, or the inner-loop. The first memory location accessed
4481 by the following dataref ('in' points to short):
4483 for (i=0; i<N; i++)
4484 for (j=0; j<M; j++)
4485 s += in[i+j]
4487 is as follows:
4488 if LOOP=i_loop: &in (relative to i_loop)
4489 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4490 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4491 initial address. Unlike OFFSET, which is number of elements to
4492 be added, BYTE_OFFSET is measured in bytes.
4494 Output:
4495 1. Return an SSA_NAME whose value is the address of the memory location of
4496 the first vector of the data reference.
4497 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4498 these statement(s) which define the returned SSA_NAME.
4500 FORNOW: We are only handling array accesses with step 1. */
4502 tree
4503 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4504 gimple_seq *new_stmt_list,
4505 tree offset,
4506 tree byte_offset)
4508 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4509 const char *base_name;
4510 tree addr_base;
4511 tree dest;
4512 gimple_seq seq = NULL;
4513 tree vect_ptr_type;
4514 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4515 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4516 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4518 tree data_ref_base = unshare_expr (drb->base_address);
4519 tree base_offset = unshare_expr (drb->offset);
4520 tree init = unshare_expr (drb->init);
4522 if (loop_vinfo)
4523 base_name = get_name (data_ref_base);
4524 else
4526 base_offset = ssize_int (0);
4527 init = ssize_int (0);
4528 base_name = get_name (DR_REF (dr));
4531 /* Create base_offset */
4532 base_offset = size_binop (PLUS_EXPR,
4533 fold_convert (sizetype, base_offset),
4534 fold_convert (sizetype, init));
4536 if (offset)
4538 offset = fold_build2 (MULT_EXPR, sizetype,
4539 fold_convert (sizetype, offset), step);
4540 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4541 base_offset, offset);
4543 if (byte_offset)
4545 byte_offset = fold_convert (sizetype, byte_offset);
4546 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4547 base_offset, byte_offset);
4550 /* base + base_offset */
4551 if (loop_vinfo)
4552 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4553 else
4555 addr_base = build1 (ADDR_EXPR,
4556 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4557 unshare_expr (DR_REF (dr)));
4560 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4561 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4562 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4563 gimple_seq_add_seq (new_stmt_list, seq);
4565 if (DR_PTR_INFO (dr)
4566 && TREE_CODE (addr_base) == SSA_NAME
4567 && !SSA_NAME_PTR_INFO (addr_base))
4569 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4570 if (offset || byte_offset)
4571 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4574 if (dump_enabled_p ())
4576 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4577 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4578 dump_printf (MSG_NOTE, "\n");
4581 return addr_base;
4585 /* Function vect_create_data_ref_ptr.
4587 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4588 location accessed in the loop by STMT_INFO, along with the def-use update
4589 chain to appropriately advance the pointer through the loop iterations.
4590 Also set aliasing information for the pointer. This pointer is used by
4591 the callers to this function to create a memory reference expression for
4592 vector load/store access.
4594 Input:
4595 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4596 GIMPLE_ASSIGN <name, data-ref> or
4597 GIMPLE_ASSIGN <data-ref, name>.
4598 2. AGGR_TYPE: the type of the reference, which should be either a vector
4599 or an array.
4600 3. AT_LOOP: the loop where the vector memref is to be created.
4601 4. OFFSET (optional): an offset to be added to the initial address accessed
4602 by the data-ref in STMT_INFO.
4603 5. BSI: location where the new stmts are to be placed if there is no loop
4604 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4605 pointing to the initial address.
4606 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4607 to the initial address accessed by the data-ref in STMT_INFO. This is
4608 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4609 in bytes.
4610 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4611 to the IV during each iteration of the loop. NULL says to move
4612 by one copy of AGGR_TYPE up or down, depending on the step of the
4613 data reference.
4615 Output:
4616 1. Declare a new ptr to vector_type, and have it point to the base of the
4617 data reference (the initial address accessed by the data reference).
4618 For example, for vector of type V8HI, the following code is generated:
4620 v8hi *ap;
4621 ap = (v8hi *)initial_address;
4623 if OFFSET is not supplied:
4624 initial_address = &a[init];
4625 if OFFSET is supplied:
4626 initial_address = &a[init + OFFSET];
4627 if BYTE_OFFSET is supplied:
4628 initial_address = &a[init] + BYTE_OFFSET;
4630 Return the initial_address in INITIAL_ADDRESS.
4632 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4633 update the pointer in each iteration of the loop.
4635 Return the increment stmt that updates the pointer in PTR_INCR.
4637 3. Set INV_P to true if the access pattern of the data reference in the
4638 vectorized loop is invariant. Set it to false otherwise.
4640 4. Return the pointer. */
4642 tree
4643 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4644 struct loop *at_loop, tree offset,
4645 tree *initial_address, gimple_stmt_iterator *gsi,
4646 gimple **ptr_incr, bool only_init, bool *inv_p,
4647 tree byte_offset, tree iv_step)
4649 const char *base_name;
4650 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4651 struct loop *loop = NULL;
4652 bool nested_in_vect_loop = false;
4653 struct loop *containing_loop = NULL;
4654 tree aggr_ptr_type;
4655 tree aggr_ptr;
4656 tree new_temp;
4657 gimple_seq new_stmt_list = NULL;
4658 edge pe = NULL;
4659 basic_block new_bb;
4660 tree aggr_ptr_init;
4661 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4662 tree aptr;
4663 gimple_stmt_iterator incr_gsi;
4664 bool insert_after;
4665 tree indx_before_incr, indx_after_incr;
4666 gimple *incr;
4667 tree step;
4668 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4670 gcc_assert (iv_step != NULL_TREE
4671 || TREE_CODE (aggr_type) == ARRAY_TYPE
4672 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4674 if (loop_vinfo)
4676 loop = LOOP_VINFO_LOOP (loop_vinfo);
4677 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4678 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4679 pe = loop_preheader_edge (loop);
4681 else
4683 gcc_assert (bb_vinfo);
4684 only_init = true;
4685 *ptr_incr = NULL;
4688 /* Check the step (evolution) of the load in LOOP, and record
4689 whether it's invariant. */
4690 step = vect_dr_behavior (dr)->step;
4691 if (integer_zerop (step))
4692 *inv_p = true;
4693 else
4694 *inv_p = false;
4696 /* Create an expression for the first address accessed by this load
4697 in LOOP. */
4698 base_name = get_name (DR_BASE_ADDRESS (dr));
4700 if (dump_enabled_p ())
4702 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4703 dump_printf_loc (MSG_NOTE, vect_location,
4704 "create %s-pointer variable to type: ",
4705 get_tree_code_name (TREE_CODE (aggr_type)));
4706 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4707 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4708 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4709 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4710 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4711 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4712 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4713 else
4714 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4715 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4716 dump_printf (MSG_NOTE, "\n");
4719 /* (1) Create the new aggregate-pointer variable.
4720 Vector and array types inherit the alias set of their component
4721 type by default so we need to use a ref-all pointer if the data
4722 reference does not conflict with the created aggregated data
4723 reference because it is not addressable. */
4724 bool need_ref_all = false;
4725 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4726 get_alias_set (DR_REF (dr))))
4727 need_ref_all = true;
4728 /* Likewise for any of the data references in the stmt group. */
4729 else if (DR_GROUP_SIZE (stmt_info) > 1)
4731 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4734 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4735 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4736 get_alias_set (DR_REF (sdr))))
4738 need_ref_all = true;
4739 break;
4741 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4743 while (sinfo);
4745 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4746 need_ref_all);
4747 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4750 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4751 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4752 def-use update cycles for the pointer: one relative to the outer-loop
4753 (LOOP), which is what steps (3) and (4) below do. The other is relative
4754 to the inner-loop (which is the inner-most loop containing the dataref),
4755 and this is done by step (5) below.
4757 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4758 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4759 redundant. Steps (3),(4) create the following:
4761 vp0 = &base_addr;
4762 LOOP: vp1 = phi(vp0,vp2)
4765 vp2 = vp1 + step
4766 goto LOOP
4768 If there is an inner-loop nested in loop, then step (5) will also be
4769 applied, and an additional update in the inner-loop will be created:
4771 vp0 = &base_addr;
4772 LOOP: vp1 = phi(vp0,vp2)
4774 inner: vp3 = phi(vp1,vp4)
4775 vp4 = vp3 + inner_step
4776 if () goto inner
4778 vp2 = vp1 + step
4779 if () goto LOOP */
4781 /* (2) Calculate the initial address of the aggregate-pointer, and set
4782 the aggregate-pointer to point to it before the loop. */
4784 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4786 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4787 offset, byte_offset);
4788 if (new_stmt_list)
4790 if (pe)
4792 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4793 gcc_assert (!new_bb);
4795 else
4796 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4799 *initial_address = new_temp;
4800 aggr_ptr_init = new_temp;
4802 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4803 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4804 inner-loop nested in LOOP (during outer-loop vectorization). */
4806 /* No update in loop is required. */
4807 if (only_init && (!loop_vinfo || at_loop == loop))
4808 aptr = aggr_ptr_init;
4809 else
4811 if (iv_step == NULL_TREE)
4813 /* The step of the aggregate pointer is the type size. */
4814 iv_step = TYPE_SIZE_UNIT (aggr_type);
4815 /* One exception to the above is when the scalar step of the load in
4816 LOOP is zero. In this case the step here is also zero. */
4817 if (*inv_p)
4818 iv_step = size_zero_node;
4819 else if (tree_int_cst_sgn (step) == -1)
4820 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4823 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4825 create_iv (aggr_ptr_init,
4826 fold_convert (aggr_ptr_type, iv_step),
4827 aggr_ptr, loop, &incr_gsi, insert_after,
4828 &indx_before_incr, &indx_after_incr);
4829 incr = gsi_stmt (incr_gsi);
4830 loop_vinfo->add_stmt (incr);
4832 /* Copy the points-to information if it exists. */
4833 if (DR_PTR_INFO (dr))
4835 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4836 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4838 if (ptr_incr)
4839 *ptr_incr = incr;
4841 aptr = indx_before_incr;
4844 if (!nested_in_vect_loop || only_init)
4845 return aptr;
4848 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4849 nested in LOOP, if it exists. */
4851 gcc_assert (nested_in_vect_loop);
4852 if (!only_init)
4854 standard_iv_increment_position (containing_loop, &incr_gsi,
4855 &insert_after);
4856 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4857 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4858 &indx_after_incr);
4859 incr = gsi_stmt (incr_gsi);
4860 loop_vinfo->add_stmt (incr);
4862 /* Copy the points-to information if it exists. */
4863 if (DR_PTR_INFO (dr))
4865 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4866 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4868 if (ptr_incr)
4869 *ptr_incr = incr;
4871 return indx_before_incr;
4873 else
4874 gcc_unreachable ();
4878 /* Function bump_vector_ptr
4880 Increment a pointer (to a vector type) by vector-size. If requested,
4881 i.e. if PTR-INCR is given, then also connect the new increment stmt
4882 to the existing def-use update-chain of the pointer, by modifying
4883 the PTR_INCR as illustrated below:
4885 The pointer def-use update-chain before this function:
4886 DATAREF_PTR = phi (p_0, p_2)
4887 ....
4888 PTR_INCR: p_2 = DATAREF_PTR + step
4890 The pointer def-use update-chain after this function:
4891 DATAREF_PTR = phi (p_0, p_2)
4892 ....
4893 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4894 ....
4895 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4897 Input:
4898 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4899 in the loop.
4900 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4901 the loop. The increment amount across iterations is expected
4902 to be vector_size.
4903 BSI - location where the new update stmt is to be placed.
4904 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4905 BUMP - optional. The offset by which to bump the pointer. If not given,
4906 the offset is assumed to be vector_size.
4908 Output: Return NEW_DATAREF_PTR as illustrated above.
4912 tree
4913 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4914 stmt_vec_info stmt_info, tree bump)
4916 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4917 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4918 tree update = TYPE_SIZE_UNIT (vectype);
4919 gassign *incr_stmt;
4920 ssa_op_iter iter;
4921 use_operand_p use_p;
4922 tree new_dataref_ptr;
4924 if (bump)
4925 update = bump;
4927 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4928 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4929 else
4930 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4931 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4932 dataref_ptr, update);
4933 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4935 /* Copy the points-to information if it exists. */
4936 if (DR_PTR_INFO (dr))
4938 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4939 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4942 if (!ptr_incr)
4943 return new_dataref_ptr;
4945 /* Update the vector-pointer's cross-iteration increment. */
4946 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4948 tree use = USE_FROM_PTR (use_p);
4950 if (use == dataref_ptr)
4951 SET_USE (use_p, new_dataref_ptr);
4952 else
4953 gcc_assert (operand_equal_p (use, update, 0));
4956 return new_dataref_ptr;
4960 /* Copy memory reference info such as base/clique from the SRC reference
4961 to the DEST MEM_REF. */
4963 void
4964 vect_copy_ref_info (tree dest, tree src)
4966 if (TREE_CODE (dest) != MEM_REF)
4967 return;
4969 tree src_base = src;
4970 while (handled_component_p (src_base))
4971 src_base = TREE_OPERAND (src_base, 0);
4972 if (TREE_CODE (src_base) != MEM_REF
4973 && TREE_CODE (src_base) != TARGET_MEM_REF)
4974 return;
4976 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
4977 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
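/* The clique/base pair copied above is how the middle end encodes
   restrict-style disambiguation for memory references, so losing it on
   the vectorized reference would only pessimize alias analysis rather
   than break correctness -- hence the early returns when there is
   nothing suitable to copy from or to.  */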
4981 /* Function vect_create_destination_var.
4983 Create a new temporary of type VECTYPE. */
4985 tree
4986 vect_create_destination_var (tree scalar_dest, tree vectype)
4988 tree vec_dest;
4989 const char *name;
4990 char *new_name;
4991 tree type;
4992 enum vect_var_kind kind;
4994 kind = vectype
4995 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4996 ? vect_mask_var
4997 : vect_simple_var
4998 : vect_scalar_var;
4999 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5001 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5003 name = get_name (scalar_dest);
5004 if (name)
5005 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5006 else
5007 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5008 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5009 free (new_name);
5011 return vec_dest;
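/* For example (illustrative SSA name): for a scalar destination x_7 of
   type int with a V4SI VECTYPE this creates a variable based on
   "vect_x_7"; with a NULL VECTYPE it creates a scalar temporary based
   on "stmp_x_7".  */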
5014 /* Function vect_grouped_store_supported.
5016 Returns TRUE if interleave high and interleave low permutations
5017 are supported, and FALSE otherwise. */
5019 bool
5020 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5022 machine_mode mode = TYPE_MODE (vectype);
5024 /* vect_permute_store_chain requires the group size to be equal to 3 or
5025 be a power of two. */
5026 if (count != 3 && exact_log2 (count) == -1)
5028 if (dump_enabled_p ())
5029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5030 "the size of the group of accesses"
5031 " is not a power of 2 or not equal to 3\n");
5032 return false;
5035 /* Check that the permutation is supported. */
5036 if (VECTOR_MODE_P (mode))
5038 unsigned int i;
5039 if (count == 3)
5041 unsigned int j0 = 0, j1 = 0, j2 = 0;
5042 unsigned int i, j;
5044 unsigned int nelt;
5045 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5047 if (dump_enabled_p ())
5048 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5049 "cannot handle groups of 3 stores for"
5050 " variable-length vectors\n");
5051 return false;
5054 vec_perm_builder sel (nelt, nelt, 1);
5055 sel.quick_grow (nelt);
5056 vec_perm_indices indices;
5057 for (j = 0; j < 3; j++)
5059 int nelt0 = ((3 - j) * nelt) % 3;
5060 int nelt1 = ((3 - j) * nelt + 1) % 3;
5061 int nelt2 = ((3 - j) * nelt + 2) % 3;
5062 for (i = 0; i < nelt; i++)
5064 if (3 * i + nelt0 < nelt)
5065 sel[3 * i + nelt0] = j0++;
5066 if (3 * i + nelt1 < nelt)
5067 sel[3 * i + nelt1] = nelt + j1++;
5068 if (3 * i + nelt2 < nelt)
5069 sel[3 * i + nelt2] = 0;
5071 indices.new_vector (sel, 2, nelt);
5072 if (!can_vec_perm_const_p (mode, indices))
5074 if (dump_enabled_p ())
5075 dump_printf (MSG_MISSED_OPTIMIZATION,
5076 "permutation op not supported by target.\n");
5077 return false;
5080 for (i = 0; i < nelt; i++)
5082 if (3 * i + nelt0 < nelt)
5083 sel[3 * i + nelt0] = 3 * i + nelt0;
5084 if (3 * i + nelt1 < nelt)
5085 sel[3 * i + nelt1] = 3 * i + nelt1;
5086 if (3 * i + nelt2 < nelt)
5087 sel[3 * i + nelt2] = nelt + j2++;
5089 indices.new_vector (sel, 2, nelt);
5090 if (!can_vec_perm_const_p (mode, indices))
5092 if (dump_enabled_p ())
5093 dump_printf (MSG_MISSED_OPTIMIZATION,
5094 "permutation op not supported by target.\n");
5095 return false;
5098 return true;
5100 else
5102 /* If length is not equal to 3 then only power of 2 is supported. */
5103 gcc_assert (pow2p_hwi (count));
5104 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5106 /* The encoding has 2 interleaved stepped patterns. */
5107 vec_perm_builder sel (nelt, 2, 3);
5108 sel.quick_grow (6);
5109 for (i = 0; i < 3; i++)
5111 sel[i * 2] = i;
5112 sel[i * 2 + 1] = i + nelt;
5114 vec_perm_indices indices (sel, 2, nelt);
5115 if (can_vec_perm_const_p (mode, indices))
5117 for (i = 0; i < 6; i++)
5118 sel[i] += exact_div (nelt, 2);
5119 indices.new_vector (sel, 2, nelt);
5120 if (can_vec_perm_const_p (mode, indices))
5121 return true;
5126 if (dump_enabled_p ())
5127 dump_printf (MSG_MISSED_OPTIMIZATION,
5128 "permutation op not supported by target.\n");
5129 return false;
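/* Concrete illustration (assuming a 4-element vector mode): the
   power-of-2 path above checks the two interleave permutations
   { 0, 4, 1, 5 } and { 2, 6, 3, 7 }, i.e. the "high" and "low" masks
   that vect_permute_store_chain generates; the group can only be
   stored this way if the target supports both.  */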
5133 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5134 type VECTYPE. MASKED_P says whether the masked form is needed. */
5136 bool
5137 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5138 bool masked_p)
5140 if (masked_p)
5141 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5142 vec_mask_store_lanes_optab,
5143 vectype, count);
5144 else
5145 return vect_lanes_optab_supported_p ("vec_store_lanes",
5146 vec_store_lanes_optab,
5147 vectype, count);
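/* As a target-specific point of reference (an example, not required by
   this code): the vec_store_lanes optabs typically map to instructions
   such as AArch64's ST2/ST3/ST4, which store COUNT vectors with the
   interleaving done in hardware instead of via the explicit
   permutations of vect_permute_store_chain.  */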
5151 /* Function vect_permute_store_chain.
5153 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5154 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5155 the data correctly for the stores. Return the final references for stores
5156 in RESULT_CHAIN.
5158 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5159 The input is 4 vectors each containing 8 elements. We assign a number to
5160 each element, the input sequence is:
5162 1st vec: 0 1 2 3 4 5 6 7
5163 2nd vec: 8 9 10 11 12 13 14 15
5164 3rd vec: 16 17 18 19 20 21 22 23
5165 4th vec: 24 25 26 27 28 29 30 31
5167 The output sequence should be:
5169 1st vec: 0 8 16 24 1 9 17 25
5170 2nd vec: 2 10 18 26 3 11 19 27
5171 3rd vec: 4 12 20 28 5 13 21 29
5172 4th vec: 6 14 22 30 7 15 23 31
5174 i.e., we interleave the contents of the four vectors in their order.
5176 We use interleave_high/low instructions to create such output. The input of
5177 each interleave_high/low operation is two vectors:
5178 1st vec 2nd vec
5179 0 1 2 3 4 5 6 7
5180 the even elements of the result vector are obtained left-to-right from the
5181 high/low elements of the first vector. The odd elements of the result are
5182 obtained left-to-right from the high/low elements of the second vector.
5183 The output of interleave_high will be: 0 4 1 5
5184 and of interleave_low: 2 6 3 7
5187 The permutation is done in log LENGTH stages. In each stage interleave_high
5188 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5189 where the first argument is taken from the first half of DR_CHAIN and the
5190 second argument from its second half.
5191 In our example,
5193 I1: interleave_high (1st vec, 3rd vec)
5194 I2: interleave_low (1st vec, 3rd vec)
5195 I3: interleave_high (2nd vec, 4th vec)
5196 I4: interleave_low (2nd vec, 4th vec)
5198 The output for the first stage is:
5200 I1: 0 16 1 17 2 18 3 19
5201 I2: 4 20 5 21 6 22 7 23
5202 I3: 8 24 9 25 10 26 11 27
5203 I4: 12 28 13 29 14 30 15 31
5205 The output of the second stage, i.e. the final result is:
5207 I1: 0 8 16 24 1 9 17 25
5208 I2: 2 10 18 26 3 11 19 27
5209 I3: 4 12 20 28 5 13 21 29
5210 I4: 6 14 22 30 7 15 23 31. */
5212 void
5213 vect_permute_store_chain (vec<tree> dr_chain,
5214 unsigned int length,
5215 stmt_vec_info stmt_info,
5216 gimple_stmt_iterator *gsi,
5217 vec<tree> *result_chain)
5219 tree vect1, vect2, high, low;
5220 gimple *perm_stmt;
5221 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5222 tree perm_mask_low, perm_mask_high;
5223 tree data_ref;
5224 tree perm3_mask_low, perm3_mask_high;
5225 unsigned int i, j, n, log_length = exact_log2 (length);
5227 result_chain->quick_grow (length);
5228 memcpy (result_chain->address (), dr_chain.address (),
5229 length * sizeof (tree));
5231 if (length == 3)
5233 /* vect_grouped_store_supported ensures that this is constant. */
5234 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5235 unsigned int j0 = 0, j1 = 0, j2 = 0;
5237 vec_perm_builder sel (nelt, nelt, 1);
5238 sel.quick_grow (nelt);
5239 vec_perm_indices indices;
5240 for (j = 0; j < 3; j++)
5242 int nelt0 = ((3 - j) * nelt) % 3;
5243 int nelt1 = ((3 - j) * nelt + 1) % 3;
5244 int nelt2 = ((3 - j) * nelt + 2) % 3;
5246 for (i = 0; i < nelt; i++)
5248 if (3 * i + nelt0 < nelt)
5249 sel[3 * i + nelt0] = j0++;
5250 if (3 * i + nelt1 < nelt)
5251 sel[3 * i + nelt1] = nelt + j1++;
5252 if (3 * i + nelt2 < nelt)
5253 sel[3 * i + nelt2] = 0;
5255 indices.new_vector (sel, 2, nelt);
5256 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5258 for (i = 0; i < nelt; i++)
5260 if (3 * i + nelt0 < nelt)
5261 sel[3 * i + nelt0] = 3 * i + nelt0;
5262 if (3 * i + nelt1 < nelt)
5263 sel[3 * i + nelt1] = 3 * i + nelt1;
5264 if (3 * i + nelt2 < nelt)
5265 sel[3 * i + nelt2] = nelt + j2++;
5267 indices.new_vector (sel, 2, nelt);
5268 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5270 vect1 = dr_chain[0];
5271 vect2 = dr_chain[1];
5273 /* Create interleaving stmt:
5274 low = VEC_PERM_EXPR <vect1, vect2,
5275 {j, nelt, *, j + 1, nelt + j + 1, *,
5276 j + 2, nelt + j + 2, *, ...}> */
5277 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5278 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5279 vect2, perm3_mask_low);
5280 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5282 vect1 = data_ref;
5283 vect2 = dr_chain[2];
5284 /* Create interleaving stmt:
5285 low = VEC_PERM_EXPR <vect1, vect2,
5286 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5287 6, 7, nelt + j + 2, ...}> */
5288 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5289 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5290 vect2, perm3_mask_high);
5291 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5292 (*result_chain)[j] = data_ref;
5295 else
5297 /* If length is not equal to 3 then only power of 2 is supported. */
5298 gcc_assert (pow2p_hwi (length));
5300 /* The encoding has 2 interleaved stepped patterns. */
5301 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5302 vec_perm_builder sel (nelt, 2, 3);
5303 sel.quick_grow (6);
5304 for (i = 0; i < 3; i++)
5306 sel[i * 2] = i;
5307 sel[i * 2 + 1] = i + nelt;
5309 vec_perm_indices indices (sel, 2, nelt);
5310 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5312 for (i = 0; i < 6; i++)
5313 sel[i] += exact_div (nelt, 2);
5314 indices.new_vector (sel, 2, nelt);
5315 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5317 for (i = 0, n = log_length; i < n; i++)
5319 for (j = 0; j < length/2; j++)
5321 vect1 = dr_chain[j];
5322 vect2 = dr_chain[j+length/2];
5324 /* Create interleaving stmt:
5325 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5326 ...}> */
5327 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5328 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5329 vect2, perm_mask_high);
5330 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5331 (*result_chain)[2*j] = high;
5333 /* Create interleaving stmt:
5334 low = VEC_PERM_EXPR <vect1, vect2,
5335 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5336 ...}> */
5337 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5338 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5339 vect2, perm_mask_low);
5340 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5341 (*result_chain)[2*j+1] = low;
5343 memcpy (dr_chain.address (), result_chain->address (),
5344 length * sizeof (tree));
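
/* Editorial illustration (not part of GCC): a standalone sketch of the
   interleave-high/low permutation whose masks the power-of-2 branch above
   builds, using the {0, nelt, 1, nelt + 1, ...} and
   {nelt/2, nelt*3/2, nelt/2 + 1, ...} index patterns.  The toy vec_perm
   helper and all names below are hypothetical; compile it separately.  */

#include <stdio.h>

#define NELT 8

/* Model VEC_PERM_EXPR <v1, v2, sel>: element I of the result is element
   SEL[I] of the 2*NELT-element concatenation of V1 and V2.  */
static void
vec_perm (const int *v1, const int *v2, const unsigned *sel, int *out)
{
  for (unsigned i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? v1[sel[i]] : v2[sel[i] - NELT];
}

int
main (void)
{
  int vect1[NELT] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  int vect2[NELT] = { 8, 9, 10, 11, 12, 13, 14, 15 };
  unsigned sel_high[NELT], sel_low[NELT];
  int high[NELT], low[NELT];

  for (unsigned i = 0; i < NELT / 2; i++)
    {
      sel_high[2 * i] = i;                       /* 0, NELT, 1, NELT+1, ...  */
      sel_high[2 * i + 1] = i + NELT;
      sel_low[2 * i] = i + NELT / 2;             /* NELT/2, NELT*3/2, ...    */
      sel_low[2 * i + 1] = i + NELT / 2 + NELT;
    }

  vec_perm (vect1, vect2, sel_high, high);       /* 0 8 1 9 2 10 3 11   */
  vec_perm (vect1, vect2, sel_low, low);         /* 4 12 5 13 6 14 7 15 */

  for (unsigned i = 0; i < NELT; i++)
    printf ("%d %d\n", high[i], low[i]);
  return 0;
}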
5349 /* Function vect_setup_realignment
5351 This function is called when vectorizing an unaligned load using
5352 the dr_explicit_realign[_optimized] scheme.
5353 This function generates the following code at the loop prolog:
5355 p = initial_addr;
5356 x msq_init = *(floor(p)); # prolog load
5357 realignment_token = call target_builtin;
5358 loop:
5359 x msq = phi (msq_init, ---)
5361 The stmts marked with x are generated only for the case of
5362 dr_explicit_realign_optimized.
5364 The code above sets up a new (vector) pointer, pointing to the first
5365 location accessed by STMT_INFO, and a "floor-aligned" load using that
5366 pointer. It also generates code to compute the "realignment-token"
5367 (if the relevant target hook was defined), and creates a phi-node at the
5368 loop-header bb whose arguments are the result of the prolog-load (created
5369 by this function) and the result of a load that takes place in the loop
5370 (to be created by the caller to this function).
5372 For the case of dr_explicit_realign_optimized:
5373 The caller to this function uses the phi-result (msq) to create the
5374 realignment code inside the loop, and sets up the missing phi argument,
5375 as follows:
5376 loop:
5377 msq = phi (msq_init, lsq)
5378 lsq = *(floor(p')); # load in loop
5379 result = realign_load (msq, lsq, realignment_token);
5381 For the case of dr_explicit_realign:
5382 loop:
5383 msq = *(floor(p)); # load in loop
5384 p' = p + (VS-1);
5385 lsq = *(floor(p')); # load in loop
5386 result = realign_load (msq, lsq, realignment_token);
5388 Input:
5389 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5390 a memory location that may be unaligned.
5391    GSI - place where new code is to be inserted.
5392 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5393 is used.
5395 Output:
5396 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5397 target hook, if defined.
5398 Return value - the result of the loop-header phi node. */
5400 tree
5401 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5402 tree *realignment_token,
5403 enum dr_alignment_support alignment_support_scheme,
5404 tree init_addr,
5405 struct loop **at_loop)
5407 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5408 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5409 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5410 struct loop *loop = NULL;
5411 edge pe = NULL;
5412 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5413 tree vec_dest;
5414 gimple *inc;
5415 tree ptr;
5416 tree data_ref;
5417 basic_block new_bb;
5418 tree msq_init = NULL_TREE;
5419 tree new_temp;
5420 gphi *phi_stmt;
5421 tree msq = NULL_TREE;
5422 gimple_seq stmts = NULL;
5423 bool inv_p;
5424 bool compute_in_loop = false;
5425 bool nested_in_vect_loop = false;
5426 struct loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5427 struct loop *loop_for_initial_load = NULL;
5429 if (loop_vinfo)
5431 loop = LOOP_VINFO_LOOP (loop_vinfo);
5432 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5435 gcc_assert (alignment_support_scheme == dr_explicit_realign
5436 || alignment_support_scheme == dr_explicit_realign_optimized);
5438 /* We need to generate three things:
5439 1. the misalignment computation
5440 2. the extra vector load (for the optimized realignment scheme).
5441 3. the phi node for the two vectors from which the realignment is
5442 done (for the optimized realignment scheme). */
5444 /* 1. Determine where to generate the misalignment computation.
5446 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5447 calculation will be generated by this function, outside the loop (in the
5448    preheader).  Otherwise, INIT_ADDR has already been computed for us by the
5449 caller, inside the loop.
5451 Background: If the misalignment remains fixed throughout the iterations of
5452 the loop, then both realignment schemes are applicable, and also the
5453 misalignment computation can be done outside LOOP. This is because we are
5454 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5455 are a multiple of VS (the Vector Size), and therefore the misalignment in
5456 different vectorized LOOP iterations is always the same.
5457 The problem arises only if the memory access is in an inner-loop nested
5458 inside LOOP, which is now being vectorized using outer-loop vectorization.
5459 This is the only case when the misalignment of the memory access may not
5460 remain fixed throughout the iterations of the inner-loop (as explained in
5461 detail in vect_supportable_dr_alignment). In this case, not only is the
5462 optimized realignment scheme not applicable, but also the misalignment
5463 computation (and generation of the realignment token that is passed to
5464    REALIGN_LOAD) has to be done inside the loop.
5466 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5467 or not, which in turn determines if the misalignment is computed inside
5468 the inner-loop, or outside LOOP. */
5470 if (init_addr != NULL_TREE || !loop_vinfo)
5472 compute_in_loop = true;
5473 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5477 /* 2. Determine where to generate the extra vector load.
5479 For the optimized realignment scheme, instead of generating two vector
5480 loads in each iteration, we generate a single extra vector load in the
5481 preheader of the loop, and in each iteration reuse the result of the
5482 vector load from the previous iteration. In case the memory access is in
5483 an inner-loop nested inside LOOP, which is now being vectorized using
5484 outer-loop vectorization, we need to determine whether this initial vector
5485 load should be generated at the preheader of the inner-loop, or can be
5486 generated at the preheader of LOOP. If the memory access has no evolution
5487 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5488 to be generated inside LOOP (in the preheader of the inner-loop). */
5490 if (nested_in_vect_loop)
5492 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5493 bool invariant_in_outerloop =
5494 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5495 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5497 else
5498 loop_for_initial_load = loop;
5499 if (at_loop)
5500 *at_loop = loop_for_initial_load;
5502 if (loop_for_initial_load)
5503 pe = loop_preheader_edge (loop_for_initial_load);
5505 /* 3. For the case of the optimized realignment, create the first vector
5506 load at the loop preheader. */
5508 if (alignment_support_scheme == dr_explicit_realign_optimized)
5510 /* Create msq_init = *(floor(p1)) in the loop preheader */
5511 gassign *new_stmt;
5513 gcc_assert (!compute_in_loop);
5514 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5515 ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5516 loop_for_initial_load, NULL_TREE,
5517 &init_addr, NULL, &inc, true, &inv_p);
5518 if (TREE_CODE (ptr) == SSA_NAME)
5519 new_temp = copy_ssa_name (ptr);
5520 else
5521 new_temp = make_ssa_name (TREE_TYPE (ptr));
5522 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5523 new_stmt = gimple_build_assign
5524 (new_temp, BIT_AND_EXPR, ptr,
5525 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5526 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5527 gcc_assert (!new_bb);
5528 data_ref
5529 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5530 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5531 vect_copy_ref_info (data_ref, DR_REF (dr));
5532 new_stmt = gimple_build_assign (vec_dest, data_ref);
5533 new_temp = make_ssa_name (vec_dest, new_stmt);
5534 gimple_assign_set_lhs (new_stmt, new_temp);
5535 if (pe)
5537 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5538 gcc_assert (!new_bb);
5540 else
5541 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5543 msq_init = gimple_assign_lhs (new_stmt);
5546 /* 4. Create realignment token using a target builtin, if available.
5547 It is done either inside the containing loop, or before LOOP (as
5548 determined above). */
5550 if (targetm.vectorize.builtin_mask_for_load)
5552 gcall *new_stmt;
5553 tree builtin_decl;
5555       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5556 if (!init_addr)
5558 /* Generate the INIT_ADDR computation outside LOOP. */
5559 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5560 NULL_TREE);
5561 if (loop)
5563 pe = loop_preheader_edge (loop);
5564 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5565 gcc_assert (!new_bb);
5567 else
5568 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5571 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5572 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5573 vec_dest =
5574 vect_create_destination_var (scalar_dest,
5575 gimple_call_return_type (new_stmt));
5576 new_temp = make_ssa_name (vec_dest, new_stmt);
5577 gimple_call_set_lhs (new_stmt, new_temp);
5579 if (compute_in_loop)
5580 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5581 else
5583 /* Generate the misalignment computation outside LOOP. */
5584 pe = loop_preheader_edge (loop);
5585 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5586 gcc_assert (!new_bb);
5589 *realignment_token = gimple_call_lhs (new_stmt);
5591 /* The result of the CALL_EXPR to this builtin is determined from
5592 the value of the parameter and no global variables are touched
5593 which makes the builtin a "const" function. Requiring the
5594 builtin to have the "const" attribute makes it unnecessary
5595 to call mark_call_clobbered. */
5596 gcc_assert (TREE_READONLY (builtin_decl));
5599 if (alignment_support_scheme == dr_explicit_realign)
5600 return msq;
5602 gcc_assert (!compute_in_loop);
5603 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5606 /* 5. Create msq = phi <msq_init, lsq> in loop */
5608 pe = loop_preheader_edge (containing_loop);
5609 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5610 msq = make_ssa_name (vec_dest);
5611 phi_stmt = create_phi_node (msq, containing_loop->header);
5612 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5614 return msq;
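
/* Editorial illustration (not part of GCC): a scalar model of the
   dr_explicit_realign scheme that vect_setup_realignment prepares above.
   "Aligned" here means an index rounded down to a multiple of VS, and the
   realignment token is modelled as the element misalignment of the access.
   All names and the helpers are hypothetical; compile it separately.  */

#include <stdio.h>

#define VS 4   /* elements per vector */

/* Load VS consecutive elements starting at an aligned index.  */
static void
aligned_load (const int *base, unsigned aligned_idx, int *vec)
{
  for (unsigned i = 0; i < VS; i++)
    vec[i] = base[aligned_idx + i];
}

/* REALIGN_LOAD <msq, lsq, token>: pick VS elements starting at offset
   TOKEN within the concatenation of MSQ and LSQ.  */
static void
realign_load (const int *msq, const int *lsq, unsigned token, int *out)
{
  for (unsigned i = 0; i < VS; i++)
    out[i] = token + i < VS ? msq[token + i] : lsq[token + i - VS];
}

int
main (void)
{
  int a[16];
  for (int i = 0; i < 16; i++)
    a[i] = i;

  unsigned p = 5;                        /* unaligned start index       */
  unsigned token = p % VS;               /* misalignment, the "token"   */
  int msq[VS], lsq[VS], result[VS];

  aligned_load (a, p - token, msq);      /* *(floor (p))                */
  aligned_load (a, p - token + VS, lsq); /* next aligned chunk          */
  realign_load (msq, lsq, token, result);

  for (unsigned i = 0; i < VS; i++)
    printf ("%d ", result[i]);           /* prints: 5 6 7 8             */
  printf ("\n");
  return 0;
}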
5618 /* Function vect_grouped_load_supported.
5620 COUNT is the size of the load group (the number of statements plus the
5621 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5622 only one statement, with a gap of COUNT - 1.
5624 Returns true if a suitable permute exists. */
5626 bool
5627 vect_grouped_load_supported (tree vectype, bool single_element_p,
5628 unsigned HOST_WIDE_INT count)
5630 machine_mode mode = TYPE_MODE (vectype);
5632 /* If this is single-element interleaving with an element distance
5633      that leaves unused vector loads around, punt - we at least create
5634 very sub-optimal code in that case (and blow up memory,
5635 see PR65518). */
5636 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5638 if (dump_enabled_p ())
5639 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5640 "single-element interleaving not supported "
5641			 "for non-adjacent vector loads\n");
5642 return false;
5645 /* vect_permute_load_chain requires the group size to be equal to 3 or
5646 be a power of two. */
5647 if (count != 3 && exact_log2 (count) == -1)
5649 if (dump_enabled_p ())
5650 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5651 "the size of the group of accesses"
5652 " is not a power of 2 or not equal to 3\n");
5653 return false;
5656 /* Check that the permutation is supported. */
5657 if (VECTOR_MODE_P (mode))
5659 unsigned int i, j;
5660 if (count == 3)
5662 unsigned int nelt;
5663 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5665 if (dump_enabled_p ())
5666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5667 "cannot handle groups of 3 loads for"
5668 " variable-length vectors\n");
5669 return false;
5672 vec_perm_builder sel (nelt, nelt, 1);
5673 sel.quick_grow (nelt);
5674 vec_perm_indices indices;
5675 unsigned int k;
5676 for (k = 0; k < 3; k++)
5678 for (i = 0; i < nelt; i++)
5679 if (3 * i + k < 2 * nelt)
5680 sel[i] = 3 * i + k;
5681 else
5682 sel[i] = 0;
5683 indices.new_vector (sel, 2, nelt);
5684 if (!can_vec_perm_const_p (mode, indices))
5686 if (dump_enabled_p ())
5687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5688 "shuffle of 3 loads is not supported by"
5689 " target\n");
5690 return false;
5692 for (i = 0, j = 0; i < nelt; i++)
5693 if (3 * i + k < 2 * nelt)
5694 sel[i] = i;
5695 else
5696 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5697 indices.new_vector (sel, 2, nelt);
5698 if (!can_vec_perm_const_p (mode, indices))
5700 if (dump_enabled_p ())
5701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5702 "shuffle of 3 loads is not supported by"
5703 " target\n");
5704 return false;
5707 return true;
5709 else
5711       /* If length is not equal to 3 then only a power of 2 is supported.  */
5712 gcc_assert (pow2p_hwi (count));
5713 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5715 /* The encoding has a single stepped pattern. */
5716 vec_perm_builder sel (nelt, 1, 3);
5717 sel.quick_grow (3);
5718 for (i = 0; i < 3; i++)
5719 sel[i] = i * 2;
5720 vec_perm_indices indices (sel, 2, nelt);
5721 if (can_vec_perm_const_p (mode, indices))
5723 for (i = 0; i < 3; i++)
5724 sel[i] = i * 2 + 1;
5725 indices.new_vector (sel, 2, nelt);
5726 if (can_vec_perm_const_p (mode, indices))
5727 return true;
5732 if (dump_enabled_p ())
5733 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5734 "extract even/odd not supported by target\n");
5735 return false;
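
/* Editorial illustration (not part of GCC): the two permute masks that
   vect_grouped_load_supported checks for a group of 3 loads, instantiated
   for NELT == 8 and K == 0, together with the strided result they produce
   when applied the same way as in vect_permute_load_chain.  The toy
   vec_perm helper and all names are hypothetical; compile it separately.  */

#include <stdio.h>

#define NELT 8

static void
vec_perm (const int *v1, const int *v2, const unsigned *sel, int *out)
{
  for (unsigned i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? v1[sel[i]] : v2[sel[i] - NELT];
}

int
main (void)
{
  int v0[NELT], v1[NELT], v2[NELT];
  for (unsigned i = 0; i < NELT; i++)
    {
      v0[i] = i;
      v1[i] = NELT + i;
      v2[i] = 2 * NELT + i;
    }

  const unsigned k = 0;
  unsigned sel_low[NELT], sel_high[NELT];
  unsigned j = 0;
  for (unsigned i = 0; i < NELT; i++)
    {
      /* First mask: pick elements k, 3 + k, 6 + k, ... from the first two
	 vectors, leaving the tail to be filled from the third vector.  */
      sel_low[i] = 3 * i + k < 2 * NELT ? 3 * i + k : 0;
      /* Second mask: keep what the first permute produced and fill the
	 tail from the third vector.  */
      sel_high[i] = 3 * i + k < 2 * NELT
		    ? i : NELT + ((NELT + k) % 3) + 3 * j++;
    }

  int tmp[NELT], res[NELT];
  vec_perm (v0, v1, sel_low, tmp);    /* {0, 3, 6, 9, 12, 15, *, *}      */
  vec_perm (tmp, v2, sel_high, res);  /* {0, 3, 6, 9, 12, 15, 18, 21}    */

  for (unsigned i = 0; i < NELT; i++)
    printf ("%d ", res[i]);
  printf ("\n");
  return 0;
}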
5738 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5739 type VECTYPE. MASKED_P says whether the masked form is needed. */
5741 bool
5742 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5743 bool masked_p)
5745 if (masked_p)
5746 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5747 vec_mask_load_lanes_optab,
5748 vectype, count);
5749 else
5750 return vect_lanes_optab_supported_p ("vec_load_lanes",
5751 vec_load_lanes_optab,
5752 vectype, count);
5755 /* Function vect_permute_load_chain.
5757 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5758 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5759 the input data correctly. Return the final references for loads in
5760 RESULT_CHAIN.
5762 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5763 The input is 4 vectors each containing 8 elements. We assign a number to each
5764 element, the input sequence is:
5766 1st vec: 0 1 2 3 4 5 6 7
5767 2nd vec: 8 9 10 11 12 13 14 15
5768 3rd vec: 16 17 18 19 20 21 22 23
5769 4th vec: 24 25 26 27 28 29 30 31
5771 The output sequence should be:
5773 1st vec: 0 4 8 12 16 20 24 28
5774 2nd vec: 1 5 9 13 17 21 25 29
5775 3rd vec: 2 6 10 14 18 22 26 30
5776 4th vec: 3 7 11 15 19 23 27 31
5778 i.e., the first output vector should contain the first elements of each
5779 interleaving group, etc.
5781 We use extract_even/odd instructions to create such output. The input of
5782 each extract_even/odd operation is two vectors
5783 1st vec 2nd vec
5784 0 1 2 3 4 5 6 7
5786 and the output is the vector of extracted even/odd elements. The output of
5787 extract_even will be: 0 2 4 6
5788 and of extract_odd: 1 3 5 7
5791 The permutation is done in log LENGTH stages. In each stage extract_even
5792 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5793 their order. In our example,
5795 E1: extract_even (1st vec, 2nd vec)
5796 E2: extract_odd (1st vec, 2nd vec)
5797 E3: extract_even (3rd vec, 4th vec)
5798 E4: extract_odd (3rd vec, 4th vec)
5800 The output for the first stage will be:
5802 E1: 0 2 4 6 8 10 12 14
5803 E2: 1 3 5 7 9 11 13 15
5804 E3: 16 18 20 22 24 26 28 30
5805 E4: 17 19 21 23 25 27 29 31
5807 In order to proceed and create the correct sequence for the next stage (or
5808 for the correct output, if the second stage is the last one, as in our
5809 example), we first put the output of extract_even operation and then the
5810 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5811 The input for the second stage is:
5813 1st vec (E1): 0 2 4 6 8 10 12 14
5814 2nd vec (E3): 16 18 20 22 24 26 28 30
5815 3rd vec (E2): 1 3 5 7 9 11 13 15
5816 4th vec (E4): 17 19 21 23 25 27 29 31
5818 The output of the second stage:
5820 E1: 0 4 8 12 16 20 24 28
5821 E2: 2 6 10 14 18 22 26 30
5822 E3: 1 5 9 13 17 21 25 29
5823 E4: 3 7 11 15 19 23 27 31
5825 And RESULT_CHAIN after reordering:
5827 1st vec (E1): 0 4 8 12 16 20 24 28
5828 2nd vec (E3): 1 5 9 13 17 21 25 29
5829 3rd vec (E2): 2 6 10 14 18 22 26 30
5830 4th vec (E4): 3 7 11 15 19 23 27 31. */
5832 static void
5833 vect_permute_load_chain (vec<tree> dr_chain,
5834 unsigned int length,
5835 stmt_vec_info stmt_info,
5836 gimple_stmt_iterator *gsi,
5837 vec<tree> *result_chain)
5839 tree data_ref, first_vect, second_vect;
5840 tree perm_mask_even, perm_mask_odd;
5841 tree perm3_mask_low, perm3_mask_high;
5842 gimple *perm_stmt;
5843 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5844 unsigned int i, j, log_length = exact_log2 (length);
5846 result_chain->quick_grow (length);
5847 memcpy (result_chain->address (), dr_chain.address (),
5848 length * sizeof (tree));
5850 if (length == 3)
5852 /* vect_grouped_load_supported ensures that this is constant. */
5853 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5854 unsigned int k;
5856 vec_perm_builder sel (nelt, nelt, 1);
5857 sel.quick_grow (nelt);
5858 vec_perm_indices indices;
5859 for (k = 0; k < 3; k++)
5861 for (i = 0; i < nelt; i++)
5862 if (3 * i + k < 2 * nelt)
5863 sel[i] = 3 * i + k;
5864 else
5865 sel[i] = 0;
5866 indices.new_vector (sel, 2, nelt);
5867 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5869 for (i = 0, j = 0; i < nelt; i++)
5870 if (3 * i + k < 2 * nelt)
5871 sel[i] = i;
5872 else
5873 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5874 indices.new_vector (sel, 2, nelt);
5875 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5877 first_vect = dr_chain[0];
5878 second_vect = dr_chain[1];
5880 /* Create interleaving stmt (low part of):
5881	 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5882 ...}> */
5883 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5884 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5885 second_vect, perm3_mask_low);
5886 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5888 /* Create interleaving stmt (high part of):
5889	 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5890 ...}> */
5891 first_vect = data_ref;
5892 second_vect = dr_chain[2];
5893 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5894 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5895 second_vect, perm3_mask_high);
5896 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5897 (*result_chain)[k] = data_ref;
5900 else
5902       /* If length is not equal to 3 then only a power of 2 is supported.  */
5903 gcc_assert (pow2p_hwi (length));
5905 /* The encoding has a single stepped pattern. */
5906 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5907 vec_perm_builder sel (nelt, 1, 3);
5908 sel.quick_grow (3);
5909 for (i = 0; i < 3; ++i)
5910 sel[i] = i * 2;
5911 vec_perm_indices indices (sel, 2, nelt);
5912 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5914 for (i = 0; i < 3; ++i)
5915 sel[i] = i * 2 + 1;
5916 indices.new_vector (sel, 2, nelt);
5917 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5919 for (i = 0; i < log_length; i++)
5921 for (j = 0; j < length; j += 2)
5923 first_vect = dr_chain[j];
5924 second_vect = dr_chain[j+1];
5926 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5927 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5928 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5929 first_vect, second_vect,
5930 perm_mask_even);
5931 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5932 (*result_chain)[j/2] = data_ref;
5934 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5935 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5936 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5937 first_vect, second_vect,
5938 perm_mask_odd);
5939 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5940 (*result_chain)[j/2+length/2] = data_ref;
5942 memcpy (dr_chain.address (), result_chain->address (),
5943 length * sizeof (tree));
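
/* Editorial illustration (not part of GCC): the two-stage extract-even/odd
   permutation described in the comment above vect_permute_load_chain, run
   on the 4-vector, 8-element example from that comment.  The toy vec_perm
   helper and the array-of-arrays "chain" are hypothetical; compile it
   separately.  */

#include <stdio.h>
#include <string.h>

#define NELT 8
#define LENGTH 4

static void
vec_perm (const int *v1, const int *v2, const unsigned *sel, int *out)
{
  for (unsigned i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? v1[sel[i]] : v2[sel[i] - NELT];
}

int
main (void)
{
  int dr_chain[LENGTH][NELT], result_chain[LENGTH][NELT];
  for (unsigned v = 0; v < LENGTH; v++)
    for (unsigned i = 0; i < NELT; i++)
      dr_chain[v][i] = v * NELT + i;     /* elements 0..31 as in the comment */

  unsigned sel_even[NELT], sel_odd[NELT];
  for (unsigned i = 0; i < NELT; i++)
    {
      sel_even[i] = i * 2;               /* 0 2 4 ... 14 */
      sel_odd[i] = i * 2 + 1;            /* 1 3 5 ... 15 */
    }

  /* log2 (LENGTH) == 2 stages; even results go to the first half of the
     chain, odd results to the second half, then the chain is recycled.  */
  for (unsigned stage = 0; stage < 2; stage++)
    {
      for (unsigned j = 0; j < LENGTH; j += 2)
	{
	  vec_perm (dr_chain[j], dr_chain[j + 1], sel_even,
		    result_chain[j / 2]);
	  vec_perm (dr_chain[j], dr_chain[j + 1], sel_odd,
		    result_chain[j / 2 + LENGTH / 2]);
	}
      memcpy (dr_chain, result_chain, sizeof (dr_chain));
    }

  for (unsigned v = 0; v < LENGTH; v++)
    {
      for (unsigned i = 0; i < NELT; i++)
	printf ("%d ", result_chain[v][i]);
      printf ("\n");     /* 0 4 8 ... 28 / 1 5 ... 29 / 2 6 ... 30 / 3 7 ... 31 */
    }
  return 0;
}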
5948 /* Function vect_shift_permute_load_chain.
5950 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5951    a sequence of stmts to reorder the input data accordingly.
5952 Return the final references for loads in RESULT_CHAIN.
5953    Return true if successful, false otherwise.
5955 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5956 The input is 3 vectors each containing 8 elements. We assign a
5957 number to each element, the input sequence is:
5959 1st vec: 0 1 2 3 4 5 6 7
5960 2nd vec: 8 9 10 11 12 13 14 15
5961 3rd vec: 16 17 18 19 20 21 22 23
5963 The output sequence should be:
5965 1st vec: 0 3 6 9 12 15 18 21
5966 2nd vec: 1 4 7 10 13 16 19 22
5967 3rd vec: 2 5 8 11 14 17 20 23
5969 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5971 First we shuffle all 3 vectors to get correct elements order:
5973 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5974 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5975 3rd vec: (16 19 22) (17 20 23) (18 21)
5977    Next we unite and shift the vectors 3 times:
5979 1st step:
5980 shift right by 6 the concatenation of:
5981 "1st vec" and "2nd vec"
5982 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5983 "2nd vec" and "3rd vec"
5984 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5985 "3rd vec" and "1st vec"
5986 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5987 | New vectors |
5989 So that now new vectors are:
5991 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5992 2nd vec: (10 13) (16 19 22) (17 20 23)
5993 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5995 2nd step:
5996 shift right by 5 the concatenation of:
5997 "1st vec" and "3rd vec"
5998 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5999 "2nd vec" and "1st vec"
6000 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6001 "3rd vec" and "2nd vec"
6002 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6003 | New vectors |
6005 So that now new vectors are:
6007 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6008 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6009 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6011 3rd step:
6012 shift right by 5 the concatenation of:
6013 "1st vec" and "1st vec"
6014 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6015 shift right by 3 the concatenation of:
6016 "2nd vec" and "2nd vec"
6017 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6018 | New vectors |
6020 So that now all vectors are READY:
6021 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6022 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6023 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6025    This algorithm is faster than the one in vect_permute_load_chain if:
6026    1. "shift of a concatenation" is faster than general permutation.
6027       This is usually so.
6028    2. The TARGET machine can't execute vector instructions in parallel.
6029       This is because each step of the algorithm depends on the previous one.
6030 The algorithm in vect_permute_load_chain is much more parallel.
6032 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6035 static bool
6036 vect_shift_permute_load_chain (vec<tree> dr_chain,
6037 unsigned int length,
6038 stmt_vec_info stmt_info,
6039 gimple_stmt_iterator *gsi,
6040 vec<tree> *result_chain)
6042 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6043 tree perm2_mask1, perm2_mask2, perm3_mask;
6044 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6045 gimple *perm_stmt;
6047 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6048 unsigned int i;
6049 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6051 unsigned HOST_WIDE_INT nelt, vf;
6052 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6053 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6054 /* Not supported for variable-length vectors. */
6055 return false;
6057 vec_perm_builder sel (nelt, nelt, 1);
6058 sel.quick_grow (nelt);
6060 result_chain->quick_grow (length);
6061 memcpy (result_chain->address (), dr_chain.address (),
6062 length * sizeof (tree));
6064 if (pow2p_hwi (length) && vf > 4)
6066 unsigned int j, log_length = exact_log2 (length);
6067 for (i = 0; i < nelt / 2; ++i)
6068 sel[i] = i * 2;
6069 for (i = 0; i < nelt / 2; ++i)
6070 sel[nelt / 2 + i] = i * 2 + 1;
6071 vec_perm_indices indices (sel, 2, nelt);
6072 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6074 if (dump_enabled_p ())
6075 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6076 "shuffle of 2 fields structure is not \
6077 supported by target\n");
6078 return false;
6080 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6082 for (i = 0; i < nelt / 2; ++i)
6083 sel[i] = i * 2 + 1;
6084 for (i = 0; i < nelt / 2; ++i)
6085 sel[nelt / 2 + i] = i * 2;
6086 indices.new_vector (sel, 2, nelt);
6087 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6089 if (dump_enabled_p ())
6090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6091 "shuffle of 2 fields structure is not \
6092 supported by target\n");
6093 return false;
6095 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6097 /* Generating permutation constant to shift all elements.
6098 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6099 for (i = 0; i < nelt; i++)
6100 sel[i] = nelt / 2 + i;
6101 indices.new_vector (sel, 2, nelt);
6102 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6104 if (dump_enabled_p ())
6105 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6106 "shift permutation is not supported by target\n");
6107 return false;
6109 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6111 /* Generating permutation constant to select vector from 2.
6112 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6113 for (i = 0; i < nelt / 2; i++)
6114 sel[i] = i;
6115 for (i = nelt / 2; i < nelt; i++)
6116 sel[i] = nelt + i;
6117 indices.new_vector (sel, 2, nelt);
6118 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6120 if (dump_enabled_p ())
6121 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6122 "select is not supported by target\n");
6123 return false;
6125 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6127 for (i = 0; i < log_length; i++)
6129 for (j = 0; j < length; j += 2)
6131 first_vect = dr_chain[j];
6132 second_vect = dr_chain[j + 1];
6134 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6135 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6136 first_vect, first_vect,
6137 perm2_mask1);
6138 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6139 vect[0] = data_ref;
6141 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6142 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6143 second_vect, second_vect,
6144 perm2_mask2);
6145 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6146 vect[1] = data_ref;
6148 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6149 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6150 vect[0], vect[1], shift1_mask);
6151 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6152 (*result_chain)[j/2 + length/2] = data_ref;
6154 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6155 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6156 vect[0], vect[1], select_mask);
6157 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6158 (*result_chain)[j/2] = data_ref;
6160 memcpy (dr_chain.address (), result_chain->address (),
6161 length * sizeof (tree));
6163 return true;
6165 if (length == 3 && vf > 2)
6167 unsigned int k = 0, l = 0;
6169       /* Generating permutation constant to get all elements in right order.
6170 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6171 for (i = 0; i < nelt; i++)
6173 if (3 * k + (l % 3) >= nelt)
6175 k = 0;
6176 l += (3 - (nelt % 3));
6178 sel[i] = 3 * k + (l % 3);
6179 k++;
6181 vec_perm_indices indices (sel, 2, nelt);
6182 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6184 if (dump_enabled_p ())
6185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6186 "shuffle of 3 fields structure is not \
6187 supported by target\n");
6188 return false;
6190 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6192 /* Generating permutation constant to shift all elements.
6193 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6194 for (i = 0; i < nelt; i++)
6195 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6196 indices.new_vector (sel, 2, nelt);
6197 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6199 if (dump_enabled_p ())
6200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6201 "shift permutation is not supported by target\n");
6202 return false;
6204 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6206 /* Generating permutation constant to shift all elements.
6207 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6208 for (i = 0; i < nelt; i++)
6209 sel[i] = 2 * (nelt / 3) + 1 + i;
6210 indices.new_vector (sel, 2, nelt);
6211 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6213 if (dump_enabled_p ())
6214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6215 "shift permutation is not supported by target\n");
6216 return false;
6218 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6220 /* Generating permutation constant to shift all elements.
6221 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6222 for (i = 0; i < nelt; i++)
6223 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6224 indices.new_vector (sel, 2, nelt);
6225 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6227 if (dump_enabled_p ())
6228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6229 "shift permutation is not supported by target\n");
6230 return false;
6232 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6234 /* Generating permutation constant to shift all elements.
6235 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6236 for (i = 0; i < nelt; i++)
6237 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6238 indices.new_vector (sel, 2, nelt);
6239 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6241 if (dump_enabled_p ())
6242 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6243 "shift permutation is not supported by target\n");
6244 return false;
6246 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6248 for (k = 0; k < 3; k++)
6250 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6251 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6252 dr_chain[k], dr_chain[k],
6253 perm3_mask);
6254 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6255 vect[k] = data_ref;
6258 for (k = 0; k < 3; k++)
6260 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6261 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6262 vect[k % 3], vect[(k + 1) % 3],
6263 shift1_mask);
6264 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6265 vect_shift[k] = data_ref;
6268 for (k = 0; k < 3; k++)
6270 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6271 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6272 vect_shift[(4 - k) % 3],
6273 vect_shift[(3 - k) % 3],
6274 shift2_mask);
6275 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6276 vect[k] = data_ref;
6279 (*result_chain)[3 - (nelt % 3)] = vect[2];
6281 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6282 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6283 vect[0], shift3_mask);
6284 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6285 (*result_chain)[nelt % 3] = data_ref;
6287 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6288 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6289 vect[1], shift4_mask);
6290 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6291 (*result_chain)[0] = data_ref;
6292 return true;
6294 return false;
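
/* Editorial illustration (not part of GCC): the 2-field branch of
   vect_shift_permute_load_chain for NELT == 8 and a chain of two vectors.
   Each vector is first shuffled so its even elements land in the low half
   and its odd elements in the high half; a shift and a select of the two
   shuffled vectors then yield the even and odd streams.  The toy vec_perm
   helper and all names are hypothetical; compile it separately.  */

#include <stdio.h>

#define NELT 8

static void
vec_perm (const int *v1, const int *v2, const unsigned *sel, int *out)
{
  for (unsigned i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? v1[sel[i]] : v2[sel[i] - NELT];
}

int
main (void)
{
  int first[NELT]  = { 0, 1, 2, 3, 4, 5, 6, 7 };
  int second[NELT] = { 8, 9, 10, 11, 12, 13, 14, 15 };

  unsigned perm2_mask1[NELT], perm2_mask2[NELT];
  unsigned shift1_mask[NELT], select_mask[NELT];
  for (unsigned i = 0; i < NELT / 2; i++)
    {
      perm2_mask1[i] = i * 2;                       /* {0 2 4 6 1 3 5 7} */
      perm2_mask1[NELT / 2 + i] = i * 2 + 1;
      perm2_mask2[i] = i * 2 + 1;                   /* {1 3 5 7 0 2 4 6} */
      perm2_mask2[NELT / 2 + i] = i * 2;
    }
  for (unsigned i = 0; i < NELT; i++)
    {
      shift1_mask[i] = NELT / 2 + i;                /* {4 5 6 7 8 9 10 11}   */
      select_mask[i] = i < NELT / 2 ? i : NELT + i; /* {0 1 2 3 12 13 14 15} */
    }

  int vect0[NELT], vect1[NELT], evens[NELT], odds[NELT];
  vec_perm (first, first, perm2_mask1, vect0);      /* {0 2 4 6 1 3 5 7}       */
  vec_perm (second, second, perm2_mask2, vect1);    /* {9 11 13 15 8 10 12 14} */
  vec_perm (vect0, vect1, shift1_mask, odds);       /* {1 3 5 7 9 11 13 15}    */
  vec_perm (vect0, vect1, select_mask, evens);      /* {0 2 4 6 8 10 12 14}    */

  for (unsigned i = 0; i < NELT; i++)
    printf ("%d %d\n", evens[i], odds[i]);
  return 0;
}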
6297 /* Function vect_transform_grouped_load.
6299 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6300    to perform their permutation and ascribe the resulting vectorized statements to
6301 the scalar statements.
6304 void
6305 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6306 int size, gimple_stmt_iterator *gsi)
6308 machine_mode mode;
6309 vec<tree> result_chain = vNULL;
6311 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6312      RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
6313      vectors that are ready for vector computation.  */
6314 result_chain.create (size);
6316   /* If the reassociation width for the vector type is 2 or greater, the target
6317      machine can execute 2 or more vector instructions in parallel.  Otherwise
6318      try to get the chain for the load group using vect_shift_permute_load_chain.  */
6319 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6320 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6321 || pow2p_hwi (size)
6322 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6323 gsi, &result_chain))
6324 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6325 vect_record_grouped_load_vectors (stmt_info, result_chain);
6326 result_chain.release ();
6329 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6330 generated as part of the vectorization of STMT_INFO. Assign the statement
6331 for each vector to the associated scalar statement. */
6333 void
6334 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6335 vec<tree> result_chain)
6337 vec_info *vinfo = stmt_info->vinfo;
6338 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6339 unsigned int i, gap_count;
6340 tree tmp_data_ref;
6342 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6343      Since we scan the chain starting from its first node, their order
6344      corresponds to the order of data-refs in RESULT_CHAIN.  */
6345 stmt_vec_info next_stmt_info = first_stmt_info;
6346 gap_count = 1;
6347 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6349 if (!next_stmt_info)
6350 break;
6352       /* Skip the gaps.  Loads created for the gaps will be removed by the dead
6353 code elimination pass later. No need to check for the first stmt in
6354 the group, since it always exists.
6355 DR_GROUP_GAP is the number of steps in elements from the previous
6356 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6357 correspond to the gaps. */
6358 if (next_stmt_info != first_stmt_info
6359 && gap_count < DR_GROUP_GAP (next_stmt_info))
6361 gap_count++;
6362 continue;
6365 while (next_stmt_info)
6367 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6368 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6369 copies, and we put the new vector statement in the first available
6370 RELATED_STMT. */
6371 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6372 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6373 else
6375 if (!DR_GROUP_SAME_DR_STMT (next_stmt_info))
6377 stmt_vec_info prev_stmt_info
6378 = STMT_VINFO_VEC_STMT (next_stmt_info);
6379 stmt_vec_info rel_stmt_info
6380 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6381 while (rel_stmt_info)
6383 prev_stmt_info = rel_stmt_info;
6384 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6387 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6391 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6392 gap_count = 1;
6393 /* If NEXT_STMT_INFO accesses the same DR as the previous statement,
6394 put the same TMP_DATA_REF as its vectorized statement; otherwise
6395 get the next data-ref from RESULT_CHAIN. */
6396 if (!next_stmt_info || !DR_GROUP_SAME_DR_STMT (next_stmt_info))
6397 break;
6402 /* Function vect_can_force_dr_alignment_p.
6404    Returns whether the alignment of a DECL can be forced to be aligned
6405    on an ALIGNMENT-bit boundary.  */
6407 bool
6408 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6410 if (!VAR_P (decl))
6411 return false;
6413 if (decl_in_symtab_p (decl)
6414 && !symtab_node::get (decl)->can_increase_alignment_p ())
6415 return false;
6417 if (TREE_STATIC (decl))
6418 return (alignment <= MAX_OFILE_ALIGNMENT);
6419 else
6420 return (alignment <= MAX_STACK_ALIGNMENT);
6424 /* Return whether the data reference DR is supported with respect to its
6425 alignment.
6426 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6427    if it is aligned, i.e., check if it is possible to vectorize it with different
6428 alignment. */
6430 enum dr_alignment_support
6431 vect_supportable_dr_alignment (struct data_reference *dr,
6432 bool check_aligned_accesses)
6434 stmt_vec_info stmt_info = vect_dr_stmt (dr);
6435 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6436 machine_mode mode = TYPE_MODE (vectype);
6437 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6438 struct loop *vect_loop = NULL;
6439 bool nested_in_vect_loop = false;
6441 if (aligned_access_p (dr) && !check_aligned_accesses)
6442 return dr_aligned;
6444 /* For now assume all conditional loads/stores support unaligned
6445 access without any special code. */
6446 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6447 if (gimple_call_internal_p (stmt)
6448 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6449 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6450 return dr_unaligned_supported;
6452 if (loop_vinfo)
6454 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6455 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6458 /* Possibly unaligned access. */
6460 /* We can choose between using the implicit realignment scheme (generating
6461 a misaligned_move stmt) and the explicit realignment scheme (generating
6462 aligned loads with a REALIGN_LOAD). There are two variants to the
6463 explicit realignment scheme: optimized, and unoptimized.
6464 We can optimize the realignment only if the step between consecutive
6465 vector loads is equal to the vector size. Since the vector memory
6466 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6467 is guaranteed that the misalignment amount remains the same throughout the
6468 execution of the vectorized loop. Therefore, we can create the
6469 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6470 at the loop preheader.
6472 However, in the case of outer-loop vectorization, when vectorizing a
6473 memory access in the inner-loop nested within the LOOP that is now being
6474 vectorized, while it is guaranteed that the misalignment of the
6475 vectorized memory access will remain the same in different outer-loop
6476      vectorized memory access will remain the same in different outer-loop
6477 the execution of the inner-loop. This is because the inner-loop advances
6478 with the original scalar step (and not in steps of VS). If the inner-loop
6479 step happens to be a multiple of VS, then the misalignment remains fixed
6480 and we can use the optimized realignment scheme. For example:
6482 for (i=0; i<N; i++)
6483 for (j=0; j<M; j++)
6484 s += a[i+j];
6486 When vectorizing the i-loop in the above example, the step between
6487 consecutive vector loads is 1, and so the misalignment does not remain
6488 fixed across the execution of the inner-loop, and the realignment cannot
6489 be optimized (as illustrated in the following pseudo vectorized loop):
6491 for (i=0; i<N; i+=4)
6492 for (j=0; j<M; j++){
6493 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6494 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6495 // (assuming that we start from an aligned address).
6498 We therefore have to use the unoptimized realignment scheme:
6500 for (i=0; i<N; i+=4)
6501 for (j=k; j<M; j+=4)
6502 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6503 // that the misalignment of the initial address is
6504 // 0).
6506 The loop can then be vectorized as follows:
6508 for (k=0; k<4; k++){
6509 rt = get_realignment_token (&vp[k]);
6510 for (i=0; i<N; i+=4){
6511 v1 = vp[i+k];
6512 for (j=k; j<M; j+=4){
6513 v2 = vp[i+j+VS-1];
6514 va = REALIGN_LOAD <v1,v2,rt>;
6515 vs += va;
6516 v1 = v2;
6519 } */
6521 if (DR_IS_READ (dr))
6523 bool is_packed = false;
6524 tree type = (TREE_TYPE (DR_REF (dr)));
6526 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6527 && (!targetm.vectorize.builtin_mask_for_load
6528 || targetm.vectorize.builtin_mask_for_load ()))
6530 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6532 /* If we are doing SLP then the accesses need not have the
6533 same alignment, instead it depends on the SLP group size. */
6534 if (loop_vinfo
6535 && STMT_SLP_TYPE (stmt_info)
6536 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6537 * (DR_GROUP_SIZE
6538 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6539 TYPE_VECTOR_SUBPARTS (vectype)))
6541 else if (!loop_vinfo
6542 || (nested_in_vect_loop
6543 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6544 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6545 return dr_explicit_realign;
6546 else
6547 return dr_explicit_realign_optimized;
6549 if (!known_alignment_for_access_p (dr))
6550 is_packed = not_size_aligned (DR_REF (dr));
6552 if (targetm.vectorize.support_vector_misalignment
6553 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6554 /* Can't software pipeline the loads, but can at least do them. */
6555 return dr_unaligned_supported;
6557 else
6559 bool is_packed = false;
6560 tree type = (TREE_TYPE (DR_REF (dr)));
6562 if (!known_alignment_for_access_p (dr))
6563 is_packed = not_size_aligned (DR_REF (dr));
6565 if (targetm.vectorize.support_vector_misalignment
6566 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6567 return dr_unaligned_supported;
6570 /* Unsupported. */
6571 return dr_unaligned_unsupported;
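
/* Editorial illustration (not part of GCC): why the optimized realignment
   scheme does not apply when the inner-loop step is not a multiple of the
   vector size, as discussed in the comment in vect_supportable_dr_alignment.
   With a 4-element vector and the access a[i+j], the misalignment seen in
   successive inner-loop iterations cycles through 0, 1, 2, 3 instead of
   staying fixed.  Names are hypothetical; compile it separately.  */

#include <stdio.h>

#define VS 4   /* elements per vector */

int
main (void)
{
  /* Assume a[] itself is vector-aligned, so the element misalignment of
     &a[i+j] is simply (i + j) % VS.  */
  for (int i = 0; i < 8; i += VS)        /* vectorized outer loop       */
    {
      printf ("i = %d:", i);
      for (int j = 0; j < 8; j++)        /* scalar inner loop, step 1   */
	printf (" %d", (i + j) % VS);    /* misalignment of &a[i+j]     */
      printf ("\n");                     /* 0 1 2 3 0 1 2 3 each time   */
    }
  return 0;
}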