[19/46] Make vect_dr_stmt return a stmt_vec_info
[official-gcc.git] / gcc / tree-vect-data-refs.c
blob 72465fd081773f24d1cd08ba3a79cacf5c25989f
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
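/* The target has no dedicated array mode for this vector mode and count;
   fall back to a plain integer mode of COUNT * GET_MODE_BITSIZE (mode)
   bits.  If the target does not claim support for array modes of this
   shape, restrict the search to modes no wider than MAX_FIXED_MODE_SIZE.  */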
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
102 /* Return the smallest scalar part of STMT.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation - we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
119 tree
120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 if (is_gimple_assign (stmt)
134 && (gimple_assign_cast_p (stmt)
135 || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
136 || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
137 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
138 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
139 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144 if (rhs < lhs)
145 scalar_type = rhs_type;
148 *lhs_size_unit = lhs;
149 *rhs_size_unit = rhs;
150 return scalar_type;
154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
155 tested at run-time. Return TRUE if DDR was successfully inserted.
156 Return false if versioning is not supported. */
158 static bool
159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
161 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
163 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
164 return false;
166 if (!runtime_alias_check_p (ddr, loop,
167 optimize_loop_nest_for_speed_p (loop)))
168 return false;
170 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
171 return true;
174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
176 static void
177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
179 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
180 for (unsigned int i = 0; i < checks.length (); ++i)
181 if (checks[i] == value)
182 return;
184 if (dump_enabled_p ())
186 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
187 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
188 dump_printf (MSG_NOTE, " is nonzero\n");
190 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
193 /* Return true if we know that the order of vectorized STMT_A and
194 vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
195 At least one of the statements is a write. */
197 static bool
198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
200 stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
201 stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
203 /* Single statements are always kept in their original order. */
204 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
205 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
206 return true;
208 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
209 group are emitted at the position of the first scalar load and all
210 stores in a group are emitted at the position of the last scalar store.
211 Thus writes will happen no earlier than their current position
212 (but could happen later) while reads will happen no later than their
213 current position (but could happen earlier). Reordering is therefore
214 only possible if the first access is a write. */
215 if (is_pattern_stmt_p (stmtinfo_a))
216 stmtinfo_a = STMT_VINFO_RELATED_STMT (stmtinfo_a);
217 if (is_pattern_stmt_p (stmtinfo_b))
218 stmtinfo_b = STMT_VINFO_RELATED_STMT (stmtinfo_b);
219 gimple *earlier_stmt = get_earlier_stmt (stmtinfo_a, stmtinfo_b);
220 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
223 /* A subroutine of vect_analyze_data_ref_dependence. Handle
224 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
225 distances. These distances are conservatively correct but they don't
226 reflect a guaranteed dependence.
228 Return true if this function does all the work necessary to avoid
229 an alias or false if the caller should use the dependence distances
230 to limit the vectorization factor in the usual way. LOOP_DEPTH is
231 the depth of the loop described by LOOP_VINFO and the other arguments
232 are as for vect_analyze_data_ref_dependence. */
234 static bool
235 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
236 loop_vec_info loop_vinfo,
237 int loop_depth, unsigned int *max_vf)
239 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
240 lambda_vector dist_v;
241 unsigned int i;
242 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
244 int dist = dist_v[loop_depth];
245 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
247 /* If the user asserted safelen >= DIST consecutive iterations
248 can be executed concurrently, assume independence.
250 ??? An alternative would be to add the alias check even
251 in this case, and vectorize the fallback loop with the
252 maximum VF set to safelen. However, if the user has
253 explicitly given a length, it's less likely that that
254 would be a win. */
255 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
257 if ((unsigned int) loop->safelen < *max_vf)
258 *max_vf = loop->safelen;
259 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
260 continue;
263 /* For dependence distances of 2 or more, we have the option
264 of limiting VF or checking for an alias at runtime.
265 Prefer to check at runtime if we can, to avoid limiting
266 the VF unnecessarily when the bases are in fact independent.
268 Note that the alias checks will be removed if the VF ends up
269 being small enough. */
270 return (!STMT_VINFO_GATHER_SCATTER_P
271 (vinfo_for_stmt (DR_STMT (DDR_A (ddr))))
272 && !STMT_VINFO_GATHER_SCATTER_P
273 (vinfo_for_stmt (DR_STMT (DDR_B (ddr))))
274 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
277 return true;
281 /* Function vect_analyze_data_ref_dependence.
283 Return TRUE if there (might) exist a dependence between a memory-reference
284 DRA and a memory-reference DRB. Return FALSE if the dependence can instead
285 be checked at run time by versioning for alias. Adjust *MAX_VF according to
286 the data dependence. */
288 static bool
289 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
290 loop_vec_info loop_vinfo,
291 unsigned int *max_vf)
293 unsigned int i;
294 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
295 struct data_reference *dra = DDR_A (ddr);
296 struct data_reference *drb = DDR_B (ddr);
297 stmt_vec_info stmtinfo_a = vect_dr_stmt (dra);
298 stmt_vec_info stmtinfo_b = vect_dr_stmt (drb);
299 lambda_vector dist_v;
300 unsigned int loop_depth;
302 /* In loop analysis all data references should be vectorizable. */
303 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
304 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
305 gcc_unreachable ();
307 /* Independent data accesses. */
308 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
309 return false;
311 if (dra == drb
312 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
313 return false;
315 /* We do not have to consider dependences between accesses that belong
316 to the same group, unless the stride could be smaller than the
317 group size. */
318 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
319 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
320 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
321 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
322 return false;
324 /* Even if we have an anti-dependence then, as the vectorized loop covers at
325 least two scalar iterations, there is always also a true dependence.
326 As the vectorizer does not re-order loads and stores we can ignore
327 the anti-dependence if TBAA can disambiguate both DRs similar to the
328 case with known negative distance anti-dependences (positive
329 distance anti-dependences would violate TBAA constraints). */
330 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
331 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
332 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
333 get_alias_set (DR_REF (drb))))
334 return false;
336 /* Unknown data dependence. */
337 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
339 /* If user asserted safelen consecutive iterations can be
340 executed concurrently, assume independence. */
341 if (loop->safelen >= 2)
343 if ((unsigned int) loop->safelen < *max_vf)
344 *max_vf = loop->safelen;
345 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
346 return false;
349 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
350 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
352 if (dump_enabled_p ())
354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
355 "versioning for alias not supported for: "
356 "can't determine dependence between ");
357 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
358 DR_REF (dra));
359 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
360 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
361 DR_REF (drb));
362 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
364 return true;
367 if (dump_enabled_p ())
369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
370 "versioning for alias required: "
371 "can't determine dependence between ");
372 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
373 DR_REF (dra));
374 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
375 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
376 DR_REF (drb));
377 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
380 /* Add to list of ddrs that need to be tested at run-time. */
381 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
384 /* Known data dependence. */
385 if (DDR_NUM_DIST_VECTS (ddr) == 0)
387 /* If user asserted safelen consecutive iterations can be
388 executed concurrently, assume independence. */
389 if (loop->safelen >= 2)
391 if ((unsigned int) loop->safelen < *max_vf)
392 *max_vf = loop->safelen;
393 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
394 return false;
397 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
398 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
400 if (dump_enabled_p ())
402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
403 "versioning for alias not supported for: "
404 "bad dist vector for ");
405 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
406 DR_REF (dra));
407 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
408 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
409 DR_REF (drb));
410 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
412 return true;
415 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "versioning for alias required: "
419 "bad dist vector for ");
420 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
421 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
422 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
423 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
425 /* Add to list of ddrs that need to be tested at run-time. */
426 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
429 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
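/* First see whether the DDR_COULD_BE_INDEPENDENT_P handling can prove
   independence or arrange a runtime alias check; if it does, the distance
   vectors below do not need to limit *MAX_VF.  */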
431 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
432 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
433 loop_depth, max_vf))
434 return false;
436 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
438 int dist = dist_v[loop_depth];
440 if (dump_enabled_p ())
441 dump_printf_loc (MSG_NOTE, vect_location,
442 "dependence distance = %d.\n", dist);
444 if (dist == 0)
446 if (dump_enabled_p ())
448 dump_printf_loc (MSG_NOTE, vect_location,
449 "dependence distance == 0 between ");
450 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
451 dump_printf (MSG_NOTE, " and ");
452 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
453 dump_printf (MSG_NOTE, "\n");
456 /* When we perform grouped accesses and perform implicit CSE
457 by detecting equal accesses and doing disambiguation with
458 runtime alias tests like for
459 .. = a[i];
460 .. = a[i+1];
461 a[i] = ..;
462 a[i+1] = ..;
463 *p = ..;
464 .. = a[i];
465 .. = a[i+1];
466 where we will end up loading { a[i], a[i+1] } once, make
467 sure that inserting group loads before the first load and
468 stores after the last store will do the right thing.
469 Similar for groups like
470 a[i] = ...;
471 ... = a[i];
472 a[i+1] = ...;
473 where loads from the group interleave with the store. */
474 if (!vect_preserves_scalar_order_p (vect_dr_stmt (dra),
475 vect_dr_stmt (drb)))
477 if (dump_enabled_p ())
478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
479 "READ_WRITE dependence in interleaving.\n");
480 return true;
483 if (loop->safelen < 2)
485 tree indicator = dr_zero_step_indicator (dra);
486 if (!indicator || integer_zerop (indicator))
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
490 "access also has a zero step\n");
491 return true;
493 else if (TREE_CODE (indicator) != INTEGER_CST)
494 vect_check_nonzero_value (loop_vinfo, indicator);
496 continue;
499 if (dist > 0 && DDR_REVERSED_P (ddr))
501 /* If DDR_REVERSED_P the order of the data-refs in DDR was
502 reversed (to make distance vector positive), and the actual
503 distance is negative. */
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
506 "dependence distance negative.\n");
507 /* Record a negative dependence distance to later limit the
508 amount of stmt copying / unrolling we can perform.
509 Only need to handle read-after-write dependence. */
510 if (DR_IS_READ (drb)
511 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
512 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
513 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
514 continue;
517 unsigned int abs_dist = abs (dist);
518 if (abs_dist >= 2 && abs_dist < *max_vf)
520 /* The dependence distance requires reduction of the maximal
521 vectorization factor. */
522 *max_vf = abs (dist);
523 if (dump_enabled_p ())
524 dump_printf_loc (MSG_NOTE, vect_location,
525 "adjusting maximal vectorization factor to %i\n",
526 *max_vf);
529 if (abs_dist >= *max_vf)
531 /* Dependence distance does not create dependence, as far as
532 vectorization is concerned, in this case. */
533 if (dump_enabled_p ())
534 dump_printf_loc (MSG_NOTE, vect_location,
535 "dependence distance >= VF.\n");
536 continue;
539 if (dump_enabled_p ())
541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
542 "not vectorized, possible dependence "
543 "between data-refs ");
544 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
545 dump_printf (MSG_NOTE, " and ");
546 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
547 dump_printf (MSG_NOTE, "\n");
550 return true;
553 return false;
556 /* Function vect_analyze_data_ref_dependences.
558 Examine all the data references in the loop, and make sure there do not
559 exist any data dependences between them. Set *MAX_VF according to
560 the maximum vectorization factor the data dependences allow. */
562 bool
563 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
564 unsigned int *max_vf)
566 unsigned int i;
567 struct data_dependence_relation *ddr;
569 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
571 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
573 LOOP_VINFO_DDRS (loop_vinfo)
574 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
575 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
576 /* We need read-read dependences to compute
577 STMT_VINFO_SAME_ALIGN_REFS. */
578 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
579 &LOOP_VINFO_DDRS (loop_vinfo),
580 LOOP_VINFO_LOOP_NEST (loop_vinfo),
581 true);
582 gcc_assert (res);
585 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
587 /* For epilogues we either have no aliases or alias versioning
588 was applied to the original loop. Therefore we may just take max_vf
589 from the VF of the original loop. */
590 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
591 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
592 else
593 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
594 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
595 return false;
597 return true;
601 /* Function vect_slp_analyze_data_ref_dependence.
603 Return TRUE if there (might) exist a dependence between a memory-reference
604 DRA and a memory-reference DRB. Unlike the loop variant, this function does
605 not version for aliasing at run time and has no *MAX_VF to adjust; an
606 unknown dependence is simply treated as a dependence. */
608 static bool
609 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
611 struct data_reference *dra = DDR_A (ddr);
612 struct data_reference *drb = DDR_B (ddr);
614 /* We need to check dependences of statements marked as unvectorizable
615 as well; they can still prohibit vectorization. */
617 /* Independent data accesses. */
618 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
619 return false;
621 if (dra == drb)
622 return false;
624 /* Read-read is OK. */
625 if (DR_IS_READ (dra) && DR_IS_READ (drb))
626 return false;
628 /* If dra and drb are part of the same interleaving chain consider
629 them independent. */
630 if (STMT_VINFO_GROUPED_ACCESS (vect_dr_stmt (dra))
631 && (DR_GROUP_FIRST_ELEMENT (vect_dr_stmt (dra))
632 == DR_GROUP_FIRST_ELEMENT (vect_dr_stmt (drb))))
633 return false;
635 /* Unknown data dependence. */
636 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
638 if (dump_enabled_p ())
640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
641 "can't determine dependence between ");
642 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
643 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
644 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
645 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
648 else if (dump_enabled_p ())
650 dump_printf_loc (MSG_NOTE, vect_location,
651 "determined dependence between ");
652 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
653 dump_printf (MSG_NOTE, " and ");
654 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
655 dump_printf (MSG_NOTE, "\n");
658 return true;
662 /* Analyze dependences involved in the transform of SLP NODE. STORES
663 contains the scalar stores of this instance if we are
664 disambiguating the loads. */
666 static bool
667 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
668 vec<stmt_vec_info> stores,
669 gimple *last_store)
671 /* This walks over all stmts involved in the SLP load/store done
672 in NODE verifying we can sink them up to the last stmt in the
673 group. */
674 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
675 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
677 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
678 if (access_info == last_access)
679 continue;
680 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
681 ao_ref ref;
682 bool ref_initialized_p = false;
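/* Walk the statements between this group access and the last scalar access
   of NODE; every intervening statement that touches memory is a potential
   dependence that could prevent sinking.  */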
683 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
684 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
686 gimple *stmt = gsi_stmt (gsi);
687 if (! gimple_vuse (stmt)
688 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
689 continue;
691 /* If we couldn't record a (single) data reference for this
692 stmt we have to resort to the alias oracle. */
693 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
694 if (!dr_b)
696 /* We are moving a store or sinking a load - this means
697 we cannot use TBAA for disambiguation. */
698 if (!ref_initialized_p)
699 ao_ref_init (&ref, DR_REF (dr_a));
700 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
701 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
702 return false;
703 continue;
706 bool dependent = false;
707 /* If we run into a store of this same instance (we've just
708 marked those) then delay dependence checking until we run
709 into the last store because this is where it will have
710 been sunk to (and we verify if we can do that as well). */
711 if (gimple_visited_p (stmt))
713 if (stmt != last_store)
714 continue;
715 unsigned i;
716 stmt_vec_info store_info;
717 FOR_EACH_VEC_ELT (stores, i, store_info)
719 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
720 ddr_p ddr = initialize_data_dependence_relation
721 (dr_a, store_dr, vNULL);
722 dependent = vect_slp_analyze_data_ref_dependence (ddr);
723 free_dependence_relation (ddr);
724 if (dependent)
725 break;
728 else
730 ddr_p ddr = initialize_data_dependence_relation (dr_a,
731 dr_b, vNULL);
732 dependent = vect_slp_analyze_data_ref_dependence (ddr);
733 free_dependence_relation (ddr);
735 if (dependent)
736 return false;
739 return true;
743 /* Function vect_slp_analyze_instance_dependence.
745 Examine all the data references in the SLP instance, and make sure there
746 do not exist any data dependences that would prevent sinking its grouped
747 stores and loads to the vectorized stmt insert location. */
749 bool
750 vect_slp_analyze_instance_dependence (slp_instance instance)
752 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
754 /* The stores of this instance are at the root of the SLP tree. */
755 slp_tree store = SLP_INSTANCE_TREE (instance);
756 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
757 store = NULL;
759 /* Verify we can sink stores to the vectorized stmt insert location. */
760 gimple *last_store = NULL;
761 if (store)
763 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
764 return false;
766 /* Mark stores in this instance and remember the last one. */
767 last_store = vect_find_last_scalar_stmt_in_slp (store);
768 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
769 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
772 bool res = true;
774 /* Verify we can sink loads to the vectorized stmt insert location,
775 special-casing stores of this instance. */
776 slp_tree load;
777 unsigned int i;
778 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
779 if (! vect_slp_analyze_node_dependences (instance, load,
780 store
781 ? SLP_TREE_SCALAR_STMTS (store)
782 : vNULL, last_store))
784 res = false;
785 break;
788 /* Unset the visited flag. */
789 if (store)
790 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
791 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
793 return res;
796 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
797 the statement that contains DRB, which is useful for recording in the
798 dump file. */
800 static void
801 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
802 innermost_loop_behavior *drb)
804 bool existed;
805 innermost_loop_behavior *&entry
806 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
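/* Only record DRB if it is the first guarantee seen for this base address
   or if it guarantees a larger base alignment than the one already
   recorded.  */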
807 if (!existed || entry->base_alignment < drb->base_alignment)
809 entry = drb;
810 if (dump_enabled_p ())
812 dump_printf_loc (MSG_NOTE, vect_location,
813 "recording new base alignment for ");
814 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
815 dump_printf (MSG_NOTE, "\n");
816 dump_printf_loc (MSG_NOTE, vect_location,
817 " alignment: %d\n", drb->base_alignment);
818 dump_printf_loc (MSG_NOTE, vect_location,
819 " misalignment: %d\n", drb->base_misalignment);
820 dump_printf_loc (MSG_NOTE, vect_location,
821 " based on: ");
822 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
827 /* If the region we're going to vectorize is reached, all unconditional
828 data references occur at least once. We can therefore pool the base
829 alignment guarantees from each unconditional reference. Do this by
830 going through all the data references in VINFO and checking whether
831 the containing statement makes the reference unconditionally. If so,
832 record the alignment of the base address in VINFO so that it can be
833 used for all other references with the same base. */
835 void
836 vect_record_base_alignments (vec_info *vinfo)
838 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
839 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
840 data_reference *dr;
841 unsigned int i;
842 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
844 stmt_vec_info stmt_info = vect_dr_stmt (dr);
845 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
846 && STMT_VINFO_VECTORIZABLE (stmt_info)
847 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
849 vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
851 /* If DR is nested in the loop that is being vectorized, we can also
852 record the alignment of the base wrt the outer loop. */
853 if (loop && nested_in_vect_loop_p (loop, stmt_info))
854 vect_record_base_alignment
855 (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
860 /* Return the target alignment for the vectorized form of DR. */
862 static unsigned int
863 vect_calculate_target_alignment (struct data_reference *dr)
865 stmt_vec_info stmt_info = vect_dr_stmt (dr);
866 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
867 return targetm.vectorize.preferred_vector_alignment (vectype);
870 /* Function vect_compute_data_ref_alignment
872 Compute the misalignment of the data reference DR.
874 Output:
875 1. DR_MISALIGNMENT (DR) is defined.
877 FOR NOW: No analysis is actually performed. Misalignment is calculated
878 only for trivial cases. TODO. */
880 static void
881 vect_compute_data_ref_alignment (struct data_reference *dr)
883 stmt_vec_info stmt_info = vect_dr_stmt (dr);
884 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
885 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
886 struct loop *loop = NULL;
887 tree ref = DR_REF (dr);
888 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
890 if (dump_enabled_p ())
891 dump_printf_loc (MSG_NOTE, vect_location,
892 "vect_compute_data_ref_alignment:\n");
894 if (loop_vinfo)
895 loop = LOOP_VINFO_LOOP (loop_vinfo);
897 /* Initialize misalignment to unknown. */
898 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
900 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
901 return;
903 innermost_loop_behavior *drb = vect_dr_behavior (dr);
904 bool step_preserves_misalignment_p;
906 unsigned HOST_WIDE_INT vector_alignment
907 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
908 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
910 /* No step for BB vectorization. */
911 if (!loop)
913 gcc_assert (integer_zerop (drb->step));
914 step_preserves_misalignment_p = true;
917 /* In case the dataref is in an inner-loop of the loop that is being
918 vectorized (LOOP), we use the base and misalignment information
919 relative to the outer-loop (LOOP). This is ok only if the misalignment
920 stays the same throughout the execution of the inner-loop, which is why
921 we have to check that the stride of the dataref in the inner-loop is
922 evenly divisible by the vector alignment. */
923 else if (nested_in_vect_loop_p (loop, stmt_info))
925 step_preserves_misalignment_p
926 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
928 if (dump_enabled_p ())
930 if (step_preserves_misalignment_p)
931 dump_printf_loc (MSG_NOTE, vect_location,
932 "inner step divides the vector alignment.\n");
933 else
934 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
935 "inner step doesn't divide the vector"
936 " alignment.\n");
940 /* Similarly we can only use base and misalignment information relative to
941 an innermost loop if the misalignment stays the same throughout the
942 execution of the loop. As above, this is the case if the stride of
943 the dataref is evenly divisible by the alignment. */
944 else
946 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
947 step_preserves_misalignment_p
948 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
950 if (!step_preserves_misalignment_p && dump_enabled_p ())
951 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
952 "step doesn't divide the vector alignment.\n");
955 unsigned int base_alignment = drb->base_alignment;
956 unsigned int base_misalignment = drb->base_misalignment;
958 /* Calculate the maximum of the pooled base address alignment and the
959 alignment that we can compute for DR itself. */
960 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
961 if (entry && base_alignment < (*entry)->base_alignment)
963 base_alignment = (*entry)->base_alignment;
964 base_misalignment = (*entry)->base_misalignment;
967 if (drb->offset_alignment < vector_alignment
968 || !step_preserves_misalignment_p
969 /* We need to know whether the step wrt the vectorized loop is
970 negative when computing the starting misalignment below. */
971 || TREE_CODE (drb->step) != INTEGER_CST)
973 if (dump_enabled_p ())
975 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
976 "Unknown alignment for access: ");
977 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
978 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
980 return;
983 if (base_alignment < vector_alignment)
985 unsigned int max_alignment;
986 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
987 if (max_alignment < vector_alignment
988 || !vect_can_force_dr_alignment_p (base,
989 vector_alignment * BITS_PER_UNIT))
991 if (dump_enabled_p ())
993 dump_printf_loc (MSG_NOTE, vect_location,
994 "can't force alignment of ref: ");
995 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
996 dump_printf (MSG_NOTE, "\n");
998 return;
1001 /* Force the alignment of the decl.
1002 NOTE: This is the only change to the code we make during
1003 the analysis phase, before deciding to vectorize the loop. */
1004 if (dump_enabled_p ())
1006 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
1007 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1008 dump_printf (MSG_NOTE, "\n");
1011 DR_VECT_AUX (dr)->base_decl = base;
1012 DR_VECT_AUX (dr)->base_misaligned = true;
1013 base_misalignment = 0;
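/* The misalignment of the first access is the base misalignment plus the
   constant INIT offset; DRB->offset can be ignored here because its
   alignment was already checked against the vector alignment above.  */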
1015 poly_int64 misalignment
1016 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1018 /* If this is a backward running DR then the first access in the larger
1019 vectype actually is N-1 elements before the address in the DR.
1020 Adjust misalign accordingly. */
1021 if (tree_int_cst_sgn (drb->step) < 0)
1022 /* PLUS because STEP is negative. */
1023 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1024 * TREE_INT_CST_LOW (drb->step));
1026 unsigned int const_misalignment;
1027 if (!known_misalignment (misalignment, vector_alignment,
1028 &const_misalignment))
1030 if (dump_enabled_p ())
1032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1033 "Non-constant misalignment for access: ");
1034 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1035 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1037 return;
1040 SET_DR_MISALIGNMENT (dr, const_misalignment);
1042 if (dump_enabled_p ())
1044 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1045 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1046 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1047 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1050 return;
1053 /* Function vect_update_misalignment_for_peel.
1054 Sets DR's misalignment
1055 - to 0 if it has the same alignment as DR_PEEL,
1056 - to the misalignment computed using NPEEL if DR's alignment is known,
1057 - to -1 (unknown) otherwise.
1059 DR - the data reference whose misalignment is to be adjusted.
1060 DR_PEEL - the data reference whose misalignment is being made
1061 zero in the vector loop by the peel.
1062 NPEEL - the number of iterations in the peel loop if the misalignment
1063 of DR_PEEL is known at compile time. */
1065 static void
1066 vect_update_misalignment_for_peel (struct data_reference *dr,
1067 struct data_reference *dr_peel, int npeel)
1069 unsigned int i;
1070 vec<dr_p> same_aligned_drs;
1071 struct data_reference *current_dr;
1072 int dr_size = vect_get_scalar_dr_size (dr);
1073 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1074 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1075 stmt_vec_info peel_stmt_info = vect_dr_stmt (dr_peel);
1077 /* For interleaved data accesses the step in the loop must be multiplied by
1078 the size of the interleaving group. */
1079 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1080 dr_size *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
1081 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1082 dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1084 /* It can be assumed that the data refs with the same alignment as dr_peel
1085 are aligned in the vector loop. */
1086 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (vect_dr_stmt (dr_peel));
1087 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1089 if (current_dr != dr)
1090 continue;
1091 gcc_assert (!known_alignment_for_access_p (dr)
1092 || !known_alignment_for_access_p (dr_peel)
1093 || (DR_MISALIGNMENT (dr) / dr_size
1094 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1095 SET_DR_MISALIGNMENT (dr, 0);
1096 return;
1099 if (known_alignment_for_access_p (dr)
1100 && known_alignment_for_access_p (dr_peel))
1102 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1103 int misal = DR_MISALIGNMENT (dr);
1104 misal += negative ? -npeel * dr_size : npeel * dr_size;
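/* DR_TARGET_ALIGNMENT is a power of two, so masking with it minus one
   reduces the adjusted misalignment modulo the target alignment.  */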
1105 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1106 SET_DR_MISALIGNMENT (dr, misal);
1107 return;
1110 if (dump_enabled_p ())
1111 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1112 "to unknown (-1).\n");
1113 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1117 /* Function verify_data_ref_alignment
1119 Return TRUE if DR can be handled with respect to alignment. */
1121 static bool
1122 verify_data_ref_alignment (data_reference_p dr)
1124 enum dr_alignment_support supportable_dr_alignment
1125 = vect_supportable_dr_alignment (dr, false);
1126 if (!supportable_dr_alignment)
1128 if (dump_enabled_p ())
1130 if (DR_IS_READ (dr))
1131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1132 "not vectorized: unsupported unaligned load.");
1133 else
1134 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1135 "not vectorized: unsupported unaligned "
1136 "store.");
1138 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1139 DR_REF (dr));
1140 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1142 return false;
1145 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1146 dump_printf_loc (MSG_NOTE, vect_location,
1147 "Vectorizing an unaligned access.\n");
1149 return true;
1152 /* Function vect_verify_datarefs_alignment
1154 Return TRUE if all data references in the loop can be
1155 handled with respect to alignment. */
1157 bool
1158 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1160 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1161 struct data_reference *dr;
1162 unsigned int i;
1164 FOR_EACH_VEC_ELT (datarefs, i, dr)
1166 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1168 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1169 continue;
1171 /* For interleaving, only the alignment of the first access matters. */
1172 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1173 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1174 continue;
1176 /* Strided accesses perform only component accesses, alignment is
1177 irrelevant for them. */
1178 if (STMT_VINFO_STRIDED_P (stmt_info)
1179 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1180 continue;
1182 if (! verify_data_ref_alignment (dr))
1183 return false;
1186 return true;
1189 /* Given a memory reference EXP return whether its alignment is less
1190 than its size. */
1192 static bool
1193 not_size_aligned (tree exp)
1195 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1196 return true;
1198 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1199 > get_object_alignment (exp));
1202 /* Function vector_alignment_reachable_p
1204 Return true if vector alignment for DR is reachable by peeling
1205 a few loop iterations. Return false otherwise. */
1207 static bool
1208 vector_alignment_reachable_p (struct data_reference *dr)
1210 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1211 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1213 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1215 /* For interleaved access we peel only if the number of iterations in
1216 the prolog loop ({VF - misalignment}) is a multiple of the
1217 number of the interleaved accesses. */
1218 int elem_size, mis_in_elements;
1220 /* FORNOW: handle only known alignment. */
1221 if (!known_alignment_for_access_p (dr))
1222 return false;
1224 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1225 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1226 elem_size = vector_element_size (vector_size, nelements);
1227 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1229 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1230 return false;
1233 /* If misalignment is known at compile time then allow peeling
1234 only if natural alignment is reachable through peeling. */
1235 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1237 HOST_WIDE_INT elmsize =
1238 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1239 if (dump_enabled_p ())
1241 dump_printf_loc (MSG_NOTE, vect_location,
1242 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1243 dump_printf (MSG_NOTE,
1244 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1246 if (DR_MISALIGNMENT (dr) % elmsize)
1248 if (dump_enabled_p ())
1249 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1250 "data size does not divide the misalignment.\n");
1251 return false;
1255 if (!known_alignment_for_access_p (dr))
1257 tree type = TREE_TYPE (DR_REF (dr));
1258 bool is_packed = not_size_aligned (DR_REF (dr));
1259 if (dump_enabled_p ())
1260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1261 "Unknown misalignment, %snaturally aligned\n",
1262 is_packed ? "not " : "");
1263 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1266 return true;
1270 /* Calculate the cost of the memory access represented by DR. */
1272 static void
1273 vect_get_data_access_cost (struct data_reference *dr,
1274 unsigned int *inside_cost,
1275 unsigned int *outside_cost,
1276 stmt_vector_for_cost *body_cost_vec,
1277 stmt_vector_for_cost *prologue_cost_vec)
1279 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1280 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1281 int ncopies;
1283 if (PURE_SLP_STMT (stmt_info))
1284 ncopies = 1;
1285 else
1286 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1288 if (DR_IS_READ (dr))
1289 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1290 prologue_cost_vec, body_cost_vec, false);
1291 else
1292 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1294 if (dump_enabled_p ())
1295 dump_printf_loc (MSG_NOTE, vect_location,
1296 "vect_get_data_access_cost: inside_cost = %d, "
1297 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1301 typedef struct _vect_peel_info
1303 struct data_reference *dr;
1304 int npeel;
1305 unsigned int count;
1306 } *vect_peel_info;
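/* Peeling options are keyed purely on NPEEL: all data references that
   would become aligned by peeling the same number of iterations share one
   entry, with COUNT recording how many they are and DR remembering one
   representative reference.  */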
1308 typedef struct _vect_peel_extended_info
1310 struct _vect_peel_info peel_info;
1311 unsigned int inside_cost;
1312 unsigned int outside_cost;
1313 } *vect_peel_extended_info;
1316 /* Peeling hashtable helpers. */
1318 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1320 static inline hashval_t hash (const _vect_peel_info *);
1321 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1324 inline hashval_t
1325 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1327 return (hashval_t) peel_info->npeel;
1330 inline bool
1331 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1333 return (a->npeel == b->npeel);
1337 /* Insert DR into peeling hash table with NPEEL as key. */
1339 static void
1340 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1341 loop_vec_info loop_vinfo, struct data_reference *dr,
1342 int npeel)
1344 struct _vect_peel_info elem, *slot;
1345 _vect_peel_info **new_slot;
1346 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1348 elem.npeel = npeel;
1349 slot = peeling_htab->find (&elem);
1350 if (slot)
1351 slot->count++;
1352 else
1354 slot = XNEW (struct _vect_peel_info);
1355 slot->npeel = npeel;
1356 slot->dr = dr;
1357 slot->count = 1;
1358 new_slot = peeling_htab->find_slot (slot, INSERT);
1359 *new_slot = slot;
1362 if (!supportable_dr_alignment
1363 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1364 slot->count += VECT_MAX_COST;
1368 /* Traverse the peeling hash table to find the peeling option that aligns
1369 the maximum number of data accesses. */
1372 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1373 _vect_peel_extended_info *max)
1375 vect_peel_info elem = *slot;
1377 if (elem->count > max->peel_info.count
1378 || (elem->count == max->peel_info.count
1379 && max->peel_info.npeel > elem->npeel))
1381 max->peel_info.npeel = elem->npeel;
1382 max->peel_info.count = elem->count;
1383 max->peel_info.dr = elem->dr;
1386 return 1;
1389 /* Get the costs of peeling NPEEL iterations checking data access costs
1390 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1391 misalignment will be zero after peeling. */
1393 static void
1394 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1395 struct data_reference *dr0,
1396 unsigned int *inside_cost,
1397 unsigned int *outside_cost,
1398 stmt_vector_for_cost *body_cost_vec,
1399 stmt_vector_for_cost *prologue_cost_vec,
1400 unsigned int npeel,
1401 bool unknown_misalignment)
1403 unsigned i;
1404 data_reference *dr;
1406 FOR_EACH_VEC_ELT (datarefs, i, dr)
1408 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1409 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1410 continue;
1412 /* For interleaving, only the alignment of the first access
1413 matters. */
1414 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1415 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1416 continue;
1418 /* Strided accesses perform only component accesses, alignment is
1419 irrelevant for them. */
1420 if (STMT_VINFO_STRIDED_P (stmt_info)
1421 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1422 continue;
1424 int save_misalignment;
1425 save_misalignment = DR_MISALIGNMENT (dr);
1426 if (npeel == 0)
1428 else if (unknown_misalignment && dr == dr0)
1429 SET_DR_MISALIGNMENT (dr, 0);
1430 else
1431 vect_update_misalignment_for_peel (dr, dr0, npeel);
1432 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1433 body_cost_vec, prologue_cost_vec);
1434 SET_DR_MISALIGNMENT (dr, save_misalignment);
1438 /* Traverse peeling hash table and calculate cost for each peeling option.
1439 Find the one with the lowest cost. */
1442 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1443 _vect_peel_extended_info *min)
1445 vect_peel_info elem = *slot;
1446 int dummy;
1447 unsigned int inside_cost = 0, outside_cost = 0;
1448 stmt_vec_info stmt_info = vect_dr_stmt (elem->dr);
1449 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1450 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1451 epilogue_cost_vec;
1453 prologue_cost_vec.create (2);
1454 body_cost_vec.create (2);
1455 epilogue_cost_vec.create (2);
1457 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1458 elem->dr, &inside_cost, &outside_cost,
1459 &body_cost_vec, &prologue_cost_vec,
1460 elem->npeel, false);
1462 body_cost_vec.release ();
1464 outside_cost += vect_get_known_peeling_cost
1465 (loop_vinfo, elem->npeel, &dummy,
1466 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1467 &prologue_cost_vec, &epilogue_cost_vec);
1469 /* Prologue and epilogue costs are added to the target model later.
1470 These costs depend only on the scalar iteration cost, the
1471 number of peeling iterations finally chosen, and the number of
1472 misaligned statements. So discard the information found here. */
1473 prologue_cost_vec.release ();
1474 epilogue_cost_vec.release ();
1476 if (inside_cost < min->inside_cost
1477 || (inside_cost == min->inside_cost
1478 && outside_cost < min->outside_cost))
1480 min->inside_cost = inside_cost;
1481 min->outside_cost = outside_cost;
1482 min->peel_info.dr = elem->dr;
1483 min->peel_info.npeel = elem->npeel;
1484 min->peel_info.count = elem->count;
1487 return 1;
1491 /* Choose best peeling option by traversing peeling hash table and either
1492 choosing an option with the lowest cost (if cost model is enabled) or the
1493 option that aligns as many accesses as possible. */
1495 static struct _vect_peel_extended_info
1496 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1497 loop_vec_info loop_vinfo)
1499 struct _vect_peel_extended_info res;
1501 res.peel_info.dr = NULL;
1503 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1505 res.inside_cost = INT_MAX;
1506 res.outside_cost = INT_MAX;
1507 peeling_htab->traverse <_vect_peel_extended_info *,
1508 vect_peeling_hash_get_lowest_cost> (&res);
1510 else
1512 res.peel_info.count = 0;
1513 peeling_htab->traverse <_vect_peel_extended_info *,
1514 vect_peeling_hash_get_most_frequent> (&res);
1515 res.inside_cost = 0;
1516 res.outside_cost = 0;
1519 return res;
1522 /* Return true if the new peeling NPEEL is supported. */
1524 static bool
1525 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1526 unsigned npeel)
1528 unsigned i;
1529 struct data_reference *dr = NULL;
1530 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1531 enum dr_alignment_support supportable_dr_alignment;
1533 /* Ensure that all data refs can be vectorized after the peel. */
1534 FOR_EACH_VEC_ELT (datarefs, i, dr)
1536 int save_misalignment;
1538 if (dr == dr0)
1539 continue;
1541 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1542 /* For interleaving, only the alignment of the first access
1543 matters. */
1544 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1545 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1546 continue;
1548 /* Strided accesses perform only component accesses, alignment is
1549 irrelevant for them. */
1550 if (STMT_VINFO_STRIDED_P (stmt_info)
1551 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1552 continue;
1554 save_misalignment = DR_MISALIGNMENT (dr);
1555 vect_update_misalignment_for_peel (dr, dr0, npeel);
1556 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1557 SET_DR_MISALIGNMENT (dr, save_misalignment);
1559 if (!supportable_dr_alignment)
1560 return false;
1563 return true;
1566 /* Function vect_enhance_data_refs_alignment
1568 This pass will use loop versioning and loop peeling in order to enhance
1569 the alignment of data references in the loop.
1571 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1572 original loop is to be vectorized. Any other loops that are created by
1573 the transformations performed in this pass - are not supposed to be
1574 vectorized. This restriction will be relaxed.
1576 This pass will require a cost model to guide it whether to apply peeling
1577 or versioning or a combination of the two. For example, the scheme that
1578 Intel uses when given a loop with several memory accesses is as follows:
1579 choose one memory access ('p') whose alignment you want to force by doing
1580 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1581 other accesses are not necessarily aligned, or (2) use loop versioning to
1582 generate one loop in which all accesses are aligned, and another loop in
1583 which only 'p' is necessarily aligned.
1585 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1586 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1587 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1589 Devising a cost model is the most critical aspect of this work. It will
1590 guide us on which access to peel for, whether to use loop versioning, how
1591 many versions to create, etc. The cost model will probably consist of
1592 generic considerations as well as target specific considerations (on
1593 powerpc for example, misaligned stores are more painful than misaligned
1594 loads).
1596 Here are the general steps involved in alignment enhancements:
1598 -- original loop, before alignment analysis:
1599 for (i=0; i<N; i++){
1600 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1601 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1604 -- After vect_compute_data_refs_alignment:
1605 for (i=0; i<N; i++){
1606 x = q[i]; # DR_MISALIGNMENT(q) = 3
1607 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1610 -- Possibility 1: we do loop versioning:
1611 if (p is aligned) {
1612 for (i=0; i<N; i++){ # loop 1A
1613 x = q[i]; # DR_MISALIGNMENT(q) = 3
1614 p[i] = y; # DR_MISALIGNMENT(p) = 0
1617 else {
1618 for (i=0; i<N; i++){ # loop 1B
1619 x = q[i]; # DR_MISALIGNMENT(q) = 3
1620 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1624 -- Possibility 2: we do loop peeling:
1625 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1626 x = q[i];
1627 p[i] = y;
1629 for (i = 3; i < N; i++){ # loop 2A
1630 x = q[i]; # DR_MISALIGNMENT(q) = 0
1631 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1634 -- Possibility 3: combination of loop peeling and versioning:
1635 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1636 x = q[i];
1637 p[i] = y;
1639 if (p is aligned) {
1640 for (i = 3; i<N; i++){ # loop 3A
1641 x = q[i]; # DR_MISALIGNMENT(q) = 0
1642 p[i] = y; # DR_MISALIGNMENT(p) = 0
1645 else {
1646 for (i = 3; i<N; i++){ # loop 3B
1647 x = q[i]; # DR_MISALIGNMENT(q) = 0
1648 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1652 These loops are later passed to loop_transform to be vectorized. The
1653 vectorizer will use the alignment information to guide the transformation
1654 (whether to generate regular loads/stores, or with special handling for
1655 misalignment). */
1657 bool
1658 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1660 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1661 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1662 enum dr_alignment_support supportable_dr_alignment;
1663 struct data_reference *dr0 = NULL, *first_store = NULL;
1664 struct data_reference *dr;
1665 unsigned int i, j;
1666 bool do_peeling = false;
1667 bool do_versioning = false;
1668 bool stat;
1669 unsigned int npeel = 0;
1670 bool one_misalignment_known = false;
1671 bool one_misalignment_unknown = false;
1672 bool one_dr_unsupportable = false;
1673 struct data_reference *unsupportable_dr = NULL;
1674 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1675 unsigned possible_npeel_number = 1;
1676 tree vectype;
1677 unsigned int mis, same_align_drs_max = 0;
1678 hash_table<peel_info_hasher> peeling_htab (1);
1680 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1682 /* Reset data so we can safely be called multiple times. */
1683 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1684 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1686 /* While cost model enhancements are expected in the future, the high level
1687 view of the code at this time is as follows:
1689 A) If there is a misaligned access then see if peeling to align
1690 this access can make all data references satisfy
1691 vect_supportable_dr_alignment. If so, update data structures
1692 as needed and return true.
1694 B) If peeling wasn't possible and there is a data reference with an
1695 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1696 then see if loop versioning checks can be used to make all data
1697 references satisfy vect_supportable_dr_alignment. If so, update
1698 data structures as needed and return true.
1700 C) If neither peeling nor versioning were successful then return false if
1701 any data reference does not satisfy vect_supportable_dr_alignment.
1703 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1705 Note, Possibility 3 above (which is peeling and versioning together) is not
1706 being done at this time. */
1708 /* (1) Peeling to force alignment. */
1710 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1711 Considerations:
1712 + How many accesses will become aligned due to the peeling
1713 - How many accesses will become unaligned due to the peeling,
1714 and the cost of misaligned accesses.
1715 - The cost of peeling (the extra runtime checks, the increase
1716 in code size). */
1718 FOR_EACH_VEC_ELT (datarefs, i, dr)
1720 stmt_vec_info stmt_info = vect_dr_stmt (dr);
1722 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1723 continue;
1725 /* For interleaving, only the alignment of the first access
1726 matters. */
1727 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1728 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1729 continue;
1731 /* For scatter-gather or invariant accesses there is nothing
1732 to enhance. */
1733 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1734 || integer_zerop (DR_STEP (dr)))
1735 continue;
1737 /* Strided accesses perform only component accesses, alignment is
1738 irrelevant for them. */
1739 if (STMT_VINFO_STRIDED_P (stmt_info)
1740 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1741 continue;
1743 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1744 do_peeling = vector_alignment_reachable_p (dr);
1745 if (do_peeling)
1747 if (known_alignment_for_access_p (dr))
1749 unsigned int npeel_tmp = 0;
1750 bool negative = tree_int_cst_compare (DR_STEP (dr),
1751 size_zero_node) < 0;
1753 vectype = STMT_VINFO_VECTYPE (stmt_info);
1754 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1755 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1756 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1757 if (DR_MISALIGNMENT (dr) != 0)
1758 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1760 /* For multiple types, it is possible that the bigger type access
1761 will have more than one peeling option. E.g., a loop with two
1762 types: one of size (vector size / 4), and the other one of
1763 size (vector size / 8). The vectorization factor will be 8. If both
1764 accesses are misaligned by 3, the first one needs one scalar
1765 iteration to be aligned, and the second one needs 5. But the
1766 first one will be aligned also by peeling 5 scalar
1767 iterations, and in that case both accesses will be aligned.
1768 Hence, except for the immediate peeling amount, we also want
1769 to try to add full vector size, while we don't exceed
1770 vectorization factor.
1771 We do this automatically for cost model, since we calculate
1772 cost for every peeling option. */
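	      /* Continuing the example above (a sketch assuming the larger
		 type has four elements per vector, so target_align / dr_size
		 == 4): the candidate peel amounts recorded for the access
		 that needs one scalar iteration are 1 and 1 + 4 == 5, one
		 entry per vector per vectorized iteration, so the amount 5
		 that also aligns the smaller-type access is considered.  */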
1773 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1775 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1776 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1777 possible_npeel_number
1778 = vect_get_num_vectors (nscalars, vectype);
1780 /* NPEEL_TMP is 0 when there is no misalignment, but also
1781 allow peeling NELEMENTS. */
1782 if (DR_MISALIGNMENT (dr) == 0)
1783 possible_npeel_number++;
1786 /* Save info about DR in the hash table. Also include peeling
1787 amounts according to the explanation above. */
1788 for (j = 0; j < possible_npeel_number; j++)
1790 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1791 dr, npeel_tmp);
1792 npeel_tmp += target_align / dr_size;
1795 one_misalignment_known = true;
1797 else
1799 /* If we don't know any misalignment values, we prefer
1800 peeling for the data-ref that has the maximum number of data-refs
1801 with the same alignment, unless the target prefers to align
1802 stores over loads. */
1803 unsigned same_align_drs
1804 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1805 if (!dr0
1806 || same_align_drs_max < same_align_drs)
1808 same_align_drs_max = same_align_drs;
1809 dr0 = dr;
1811 /* For data-refs with the same number of related
1812 accesses prefer the one where the misalign
1813 computation will be invariant in the outermost loop. */
1814 else if (same_align_drs_max == same_align_drs)
1816 struct loop *ivloop0, *ivloop;
1817 ivloop0 = outermost_invariant_loop_for_expr
1818 (loop, DR_BASE_ADDRESS (dr0));
1819 ivloop = outermost_invariant_loop_for_expr
1820 (loop, DR_BASE_ADDRESS (dr));
1821 if ((ivloop && !ivloop0)
1822 || (ivloop && ivloop0
1823 && flow_loop_nested_p (ivloop, ivloop0)))
1824 dr0 = dr;
1827 one_misalignment_unknown = true;
1829 /* Check for data refs with unsupportable alignment that
1830 can be peeled. */
1831 if (!supportable_dr_alignment)
1833 one_dr_unsupportable = true;
1834 unsupportable_dr = dr;
1837 if (!first_store && DR_IS_WRITE (dr))
1838 first_store = dr;
1841 else
1843 if (!aligned_access_p (dr))
1845 if (dump_enabled_p ())
1846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1847 "vector alignment may not be reachable\n");
1848 break;
1853 /* Check if we can possibly peel the loop. */
1854 if (!vect_can_advance_ivs_p (loop_vinfo)
1855 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1856 || loop->inner)
1857 do_peeling = false;
1859 struct _vect_peel_extended_info peel_for_known_alignment;
1860 struct _vect_peel_extended_info peel_for_unknown_alignment;
1861 struct _vect_peel_extended_info best_peel;
1863 peel_for_unknown_alignment.inside_cost = INT_MAX;
1864 peel_for_unknown_alignment.outside_cost = INT_MAX;
1865 peel_for_unknown_alignment.peel_info.count = 0;
1867 if (do_peeling
1868 && one_misalignment_unknown)
1870 /* Check if the target requires to prefer stores over loads, i.e., if
1871 misaligned stores are more expensive than misaligned loads (taking
1872 drs with same alignment into account). */
1873 unsigned int load_inside_cost = 0;
1874 unsigned int load_outside_cost = 0;
1875 unsigned int store_inside_cost = 0;
1876 unsigned int store_outside_cost = 0;
1877 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1879 stmt_vector_for_cost dummy;
1880 dummy.create (2);
1881 vect_get_peeling_costs_all_drs (datarefs, dr0,
1882 &load_inside_cost,
1883 &load_outside_cost,
1884 &dummy, &dummy, estimated_npeels, true);
1885 dummy.release ();
1887 if (first_store)
1889 dummy.create (2);
1890 vect_get_peeling_costs_all_drs (datarefs, first_store,
1891 &store_inside_cost,
1892 &store_outside_cost,
1893 &dummy, &dummy,
1894 estimated_npeels, true);
1895 dummy.release ();
1897 else
1899 store_inside_cost = INT_MAX;
1900 store_outside_cost = INT_MAX;
1903 if (load_inside_cost > store_inside_cost
1904 || (load_inside_cost == store_inside_cost
1905 && load_outside_cost > store_outside_cost))
1907 dr0 = first_store;
1908 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1909 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1911 else
1913 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1914 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1917 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1918 prologue_cost_vec.create (2);
1919 epilogue_cost_vec.create (2);
1921 int dummy2;
1922 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1923 (loop_vinfo, estimated_npeels, &dummy2,
1924 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1925 &prologue_cost_vec, &epilogue_cost_vec);
1927 prologue_cost_vec.release ();
1928 epilogue_cost_vec.release ();
1930 peel_for_unknown_alignment.peel_info.count = 1
1931 + STMT_VINFO_SAME_ALIGN_REFS (vect_dr_stmt (dr0)).length ();
1934 peel_for_unknown_alignment.peel_info.npeel = 0;
1935 peel_for_unknown_alignment.peel_info.dr = dr0;
1937 best_peel = peel_for_unknown_alignment;
1939 peel_for_known_alignment.inside_cost = INT_MAX;
1940 peel_for_known_alignment.outside_cost = INT_MAX;
1941 peel_for_known_alignment.peel_info.count = 0;
1942 peel_for_known_alignment.peel_info.dr = NULL;
1944 if (do_peeling && one_misalignment_known)
1946 /* Peeling is possible, and every data access can be handled even
1947 when misaligned. So we try to choose the best possible peeling from
1948 the hash table. */
1949 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1950 (&peeling_htab, loop_vinfo);
1953 /* Compare costs of peeling for known and unknown alignment. */
1954 if (peel_for_known_alignment.peel_info.dr != NULL
1955 && peel_for_unknown_alignment.inside_cost
1956 >= peel_for_known_alignment.inside_cost)
1958 best_peel = peel_for_known_alignment;
1960 /* If the best peeling for known alignment has NPEEL == 0, perform no
1961 peeling at all except if there is an unsupportable dr that we can
1962 align. */
1963 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1964 do_peeling = false;
1967 /* If there is an unsupportable data ref, prefer this over all choices so far
1968 since we'd have to discard a chosen peeling except when it accidentally
1969 aligned the unsupportable data ref. */
1970 if (one_dr_unsupportable)
1971 dr0 = unsupportable_dr;
1972 else if (do_peeling)
1974 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1975 TODO: Use nopeel_outside_cost or get rid of it? */
1976 unsigned nopeel_inside_cost = 0;
1977 unsigned nopeel_outside_cost = 0;
1979 stmt_vector_for_cost dummy;
1980 dummy.create (2);
1981 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1982 &nopeel_outside_cost, &dummy, &dummy,
1983 0, false);
1984 dummy.release ();
1986 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1987 costs will be recorded. */
1988 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1989 prologue_cost_vec.create (2);
1990 epilogue_cost_vec.create (2);
1992 int dummy2;
1993 nopeel_outside_cost += vect_get_known_peeling_cost
1994 (loop_vinfo, 0, &dummy2,
1995 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1996 &prologue_cost_vec, &epilogue_cost_vec);
1998 prologue_cost_vec.release ();
1999 epilogue_cost_vec.release ();
2001 npeel = best_peel.peel_info.npeel;
2002 dr0 = best_peel.peel_info.dr;
2004 /* If not peeling is no more expensive than the best peeling we
2005 have found so far, don't perform any peeling. */
2006 if (nopeel_inside_cost <= best_peel.inside_cost)
2007 do_peeling = false;
2010 if (do_peeling)
2012 stmt_vec_info stmt_info = vect_dr_stmt (dr0);
2013 vectype = STMT_VINFO_VECTYPE (stmt_info);
2015 if (known_alignment_for_access_p (dr0))
2017 bool negative = tree_int_cst_compare (DR_STEP (dr0),
2018 size_zero_node) < 0;
2019 if (!npeel)
2021 /* Since it's known at compile time, compute the number of
2022 iterations in the peeled loop (the peeling factor) for use in
2023 updating DR_MISALIGNMENT values. The peeling factor is the
2024 vectorization factor minus the misalignment as an element
2025 count. */
2026 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2027 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2028 npeel = ((mis & (target_align - 1))
2029 / vect_get_scalar_dr_size (dr0));
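	      /* A worked instance of the formula above, assuming a positive
		 step, 4-byte elements and DR_TARGET_ALIGNMENT == 16: with
		 DR_MISALIGNMENT (dr0) == 4 we get mis == -4, (-4 & 15) == 12
		 and npeel == 12 / 4 == 3, i.e. three peeled scalar iterations
		 move the access from offset 4 to offset 16, now aligned.  */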
2032 /* For interleaved data access every iteration accesses all the
2033 members of the group, therefore we divide the number of iterations
2034 by the group size. */
2035 stmt_info = vect_dr_stmt (dr0);
2036 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2037 npeel /= DR_GROUP_SIZE (stmt_info);
2039 if (dump_enabled_p ())
2040 dump_printf_loc (MSG_NOTE, vect_location,
2041 "Try peeling by %d\n", npeel);
2044 /* Ensure that all datarefs can be vectorized after the peel. */
2045 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2046 do_peeling = false;
2048 /* Check if all datarefs are supportable and log. */
2049 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2051 stat = vect_verify_datarefs_alignment (loop_vinfo);
2052 if (!stat)
2053 do_peeling = false;
2054 else
2055 return stat;
2058 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2059 if (do_peeling)
2061 unsigned max_allowed_peel
2062 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2063 if (max_allowed_peel != (unsigned)-1)
2065 unsigned max_peel = npeel;
2066 if (max_peel == 0)
2068 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2069 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2071 if (max_peel > max_allowed_peel)
2073 do_peeling = false;
2074 if (dump_enabled_p ())
2075 dump_printf_loc (MSG_NOTE, vect_location,
2076 "Disable peeling, max peels reached: %d\n", max_peel);
2081 /* Cost model #2 - if peeling may result in a remaining loop not
2082 iterating enough to be vectorized then do not peel. Since this
2083 is a cost heuristic rather than a correctness decision, use the
2084 most likely runtime value for variable vectorization factors. */
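  /* For instance, with an estimated VF of 4 and npeel == 0 (a runtime
     peel amount of at most VF - 1 == 3), a loop known to run fewer than
     4 + 3 == 7 iterations is not peeled, since the vector loop might not
     execute at all afterwards (an illustration only).  */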
2085 if (do_peeling
2086 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2088 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2089 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2090 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2091 < assumed_vf + max_peel)
2092 do_peeling = false;
2095 if (do_peeling)
2097 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2098 If the misalignment of DR_i is identical to that of dr0 then set
2099 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2100 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2101 by the peeling factor times the element size of DR_i (MOD the
2102 vectorization factor times the size). Otherwise, the
2103 misalignment of DR_i must be set to unknown. */
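      /* E.g., assuming 4-byte elements, a positive step and a
	 vectorization factor of 4 (so the modulus is 16): with npeel == 3,
	 a DR_i whose known misalignment was 8 ends up with misalignment
	 (8 + 3 * 4) % 16 == 4 in the peeled loop.  */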
2104 FOR_EACH_VEC_ELT (datarefs, i, dr)
2105 if (dr != dr0)
2107 /* Strided accesses perform only component accesses, alignment
2108 is irrelevant for them. */
2109 stmt_info = vect_dr_stmt (dr);
2110 if (STMT_VINFO_STRIDED_P (stmt_info)
2111 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2112 continue;
2114 vect_update_misalignment_for_peel (dr, dr0, npeel);
2117 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2118 if (npeel)
2119 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2120 else
2121 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2122 = DR_MISALIGNMENT (dr0);
2123 SET_DR_MISALIGNMENT (dr0, 0);
2124 if (dump_enabled_p ())
2126 dump_printf_loc (MSG_NOTE, vect_location,
2127 "Alignment of access forced using peeling.\n");
2128 dump_printf_loc (MSG_NOTE, vect_location,
2129 "Peeling for alignment will be applied.\n");
2132 /* The inside-loop cost will be accounted for in vectorizable_load
2133 and vectorizable_store correctly with adjusted alignments.
2134 Drop the body_cst_vec on the floor here. */
2135 stat = vect_verify_datarefs_alignment (loop_vinfo);
2136 gcc_assert (stat);
2137 return stat;
2141 /* (2) Versioning to force alignment. */
2143 /* Try versioning if:
2144 1) the loop is optimized for speed
2145 2) there is at least one unsupported misaligned data ref with an unknown
2146 misalignment, and
2147 3) all misaligned data refs with a known misalignment are supported, and
2148 4) the number of runtime alignment checks is within reason. */
2150 do_versioning =
2151 optimize_loop_nest_for_speed_p (loop)
2152 && (!loop->inner); /* FORNOW */
2154 if (do_versioning)
2156 FOR_EACH_VEC_ELT (datarefs, i, dr)
2158 stmt_vec_info stmt_info = vect_dr_stmt (dr);
2160 /* For interleaving, only the alignment of the first access
2161 matters. */
2162 if (aligned_access_p (dr)
2163 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2164 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2165 continue;
2167 if (STMT_VINFO_STRIDED_P (stmt_info))
2169 /* Strided loads perform only component accesses, alignment is
2170 irrelevant for them. */
2171 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2172 continue;
2173 do_versioning = false;
2174 break;
2177 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2179 if (!supportable_dr_alignment)
2181 int mask;
2182 tree vectype;
2184 if (known_alignment_for_access_p (dr)
2185 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2186 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2188 do_versioning = false;
2189 break;
2192 stmt_info = vect_dr_stmt (dr);
2193 vectype = STMT_VINFO_VECTYPE (stmt_info);
2194 gcc_assert (vectype);
2196 /* At present we don't support versioning for alignment
2197 with variable VF, since there's no guarantee that the
2198 VF is a power of two. We could relax this if we added
2199 a way of enforcing a power-of-two size. */
2200 unsigned HOST_WIDE_INT size;
2201 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2203 do_versioning = false;
2204 break;
2207 /* The rightmost bits of an aligned address must be zeros.
2208 Construct the mask needed for this test. For example,
2209 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2210 mask must be 15 = 0xf. */
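	      /* Conceptually the versioned loop is guarded by a runtime
		 test of the form ((uintptr_t) addr & mask) == 0 for the
		 address of every data reference whose statement is pushed
		 onto LOOP_VINFO_MAY_MISALIGN_STMTS below; the guard itself
		 is emitted later when the loop is versioned (this sketches
		 the intent, not the literal code that is built).  */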
2211 mask = size - 1;
2213 /* FORNOW: use the same mask to test all potentially unaligned
2214 references in the loop. The vectorizer currently supports
2215 a single vector size, see the reference to
2216 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2217 vectorization factor is computed. */
2218 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2219 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2220 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2221 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2225 /* Versioning requires at least one misaligned data reference. */
2226 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2227 do_versioning = false;
2228 else if (!do_versioning)
2229 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2232 if (do_versioning)
2234 vec<gimple *> may_misalign_stmts
2235 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2236 gimple *stmt;
2238 /* It can now be assumed that the data references in the statements
2239 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2240 of the loop being vectorized. */
2241 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2243 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2244 dr = STMT_VINFO_DATA_REF (stmt_info);
2245 SET_DR_MISALIGNMENT (dr, 0);
2246 if (dump_enabled_p ())
2247 dump_printf_loc (MSG_NOTE, vect_location,
2248 "Alignment of access forced using versioning.\n");
2251 if (dump_enabled_p ())
2252 dump_printf_loc (MSG_NOTE, vect_location,
2253 "Versioning for alignment will be applied.\n");
2255 /* Peeling and versioning can't be done together at this time. */
2256 gcc_assert (! (do_peeling && do_versioning));
2258 stat = vect_verify_datarefs_alignment (loop_vinfo);
2259 gcc_assert (stat);
2260 return stat;
2263 /* This point is reached if neither peeling nor versioning is being done. */
2264 gcc_assert (! (do_peeling || do_versioning));
2266 stat = vect_verify_datarefs_alignment (loop_vinfo);
2267 return stat;
2271 /* Function vect_find_same_alignment_drs.
2273 Update group and alignment relations according to the chosen
2274 vectorization factor. */
2276 static void
2277 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2279 struct data_reference *dra = DDR_A (ddr);
2280 struct data_reference *drb = DDR_B (ddr);
2281 stmt_vec_info stmtinfo_a = vect_dr_stmt (dra);
2282 stmt_vec_info stmtinfo_b = vect_dr_stmt (drb);
2284 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2285 return;
2287 if (dra == drb)
2288 return;
2290 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2291 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2292 return;
2294 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2295 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2296 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2297 return;
2299 /* Two references with distance zero have the same alignment. */
2300 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2301 - wi::to_poly_offset (DR_INIT (drb)));
2302 if (maybe_ne (diff, 0))
2304 /* Get the wider of the two alignments. */
2305 unsigned int align_a = (vect_calculate_target_alignment (dra)
2306 / BITS_PER_UNIT);
2307 unsigned int align_b = (vect_calculate_target_alignment (drb)
2308 / BITS_PER_UNIT);
2309 unsigned int max_align = MAX (align_a, align_b);
2311 /* Require the gap to be a multiple of the larger vector alignment. */
2312 if (!multiple_p (diff, max_align))
2313 return;
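  /* For example, assuming 4-byte elements and a 16-byte vector alignment
     for both references: p[i] and p[i + 4] are 16 bytes apart, a multiple
     of the larger alignment, so they are recorded as having the same
     alignment below; p[i] and p[i + 1] (4 bytes apart) are not.  */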
2316 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2317 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2318 if (dump_enabled_p ())
2320 dump_printf_loc (MSG_NOTE, vect_location,
2321 "accesses have the same alignment: ");
2322 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2323 dump_printf (MSG_NOTE, " and ");
2324 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2325 dump_printf (MSG_NOTE, "\n");
2330 /* Function vect_analyze_data_refs_alignment
2332 Analyze the alignment of the data-references in the loop.
2333 Return FALSE if a data reference is found that cannot be vectorized. */
2335 bool
2336 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2338 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2340 /* Mark groups of data references with same alignment using
2341 data dependence information. */
2342 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2343 struct data_dependence_relation *ddr;
2344 unsigned int i;
2346 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2347 vect_find_same_alignment_drs (ddr);
2349 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2350 struct data_reference *dr;
2352 vect_record_base_alignments (vinfo);
2353 FOR_EACH_VEC_ELT (datarefs, i, dr)
2355 stmt_vec_info stmt_info = vect_dr_stmt (dr);
2356 if (STMT_VINFO_VECTORIZABLE (stmt_info))
2357 vect_compute_data_ref_alignment (dr);
2360 return true;
2364 /* Analyze alignment of DRs of stmts in NODE. */
2366 static bool
2367 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2369 /* We vectorize from the first scalar stmt in the node unless
2370 the node is permuted in which case we start from the first
2371 element in the group. */
2372 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2373 gimple *first_stmt = first_stmt_info->stmt;
2374 data_reference_p first_dr = STMT_VINFO_DATA_REF (first_stmt_info);
2375 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2376 first_stmt = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2378 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2379 vect_compute_data_ref_alignment (dr);
2380 /* For creating the data-ref pointer we need alignment of the
2381 first element anyway. */
2382 if (dr != first_dr)
2383 vect_compute_data_ref_alignment (first_dr);
2384 if (! verify_data_ref_alignment (dr))
2386 if (dump_enabled_p ())
2387 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2388 "not vectorized: bad data alignment in basic "
2389 "block.\n");
2390 return false;
2393 return true;
2396 /* Function vect_slp_analyze_and_verify_instance_alignment
2398 Analyze the alignment of the data-references in the SLP instance.
2399 Return FALSE if a data reference is found that cannot be vectorized. */
2401 bool
2402 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2404 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2406 slp_tree node;
2407 unsigned i;
2408 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2409 if (! vect_slp_analyze_and_verify_node_alignment (node))
2410 return false;
2412 node = SLP_INSTANCE_TREE (instance);
2413 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2414 && ! vect_slp_analyze_and_verify_node_alignment
2415 (SLP_INSTANCE_TREE (instance)))
2416 return false;
2418 return true;
2422 /* Analyze groups of accesses: check that DR belongs to a group of
2423 accesses of legal size, step, etc. Detect gaps, single element
2424 interleaving, and other special cases. Set grouped access info.
2425 Collect groups of strided stores for further use in SLP analysis.
2426 Worker for vect_analyze_group_access. */
2428 static bool
2429 vect_analyze_group_access_1 (struct data_reference *dr)
2431 tree step = DR_STEP (dr);
2432 tree scalar_type = TREE_TYPE (DR_REF (dr));
2433 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2434 stmt_vec_info stmt_info = vect_dr_stmt (dr);
2435 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2436 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2437 HOST_WIDE_INT dr_step = -1;
2438 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2439 bool slp_impossible = false;
2441 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2442 size of the interleaving group (including gaps). */
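  /* A small example of the bookkeeping below, assuming 4-byte elements:
     for loads a[4*i] and a[4*i + 1] the step is 16 bytes, so GROUPSIZE is
     4 while only two members are present; the second member gets
     DR_GROUP_GAP == 1 (its distance in elements from the previous member)
     and the group leader records the trailing gap of two elements.  */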
2443 if (tree_fits_shwi_p (step))
2445 dr_step = tree_to_shwi (step);
2446 /* Check that STEP is a multiple of type size. Otherwise there is
2447 a non-element-sized gap at the end of the group which we
2448 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2449 ??? As we can handle non-constant step fine here we should
2450 simply remove uses of DR_GROUP_GAP between the last and first
2451 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2452 simply not include that gap. */
2453 if ((dr_step % type_size) != 0)
2455 if (dump_enabled_p ())
2457 dump_printf_loc (MSG_NOTE, vect_location,
2458 "Step ");
2459 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2460 dump_printf (MSG_NOTE,
2461 " is not a multiple of the element size for ");
2462 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2463 dump_printf (MSG_NOTE, "\n");
2465 return false;
2467 groupsize = absu_hwi (dr_step) / type_size;
2469 else
2470 groupsize = 0;
2472 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2473 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2475 /* Check if this DR is a part of interleaving, and is a single
2476 element of the group that is accessed in the loop. */
2478 /* Gaps are supported only for loads. STEP must be a multiple of the type
2479 size. */
2480 if (DR_IS_READ (dr)
2481 && (dr_step % type_size) == 0
2482 && groupsize > 0)
2484 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2485 DR_GROUP_SIZE (stmt_info) = groupsize;
2486 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2487 if (dump_enabled_p ())
2489 dump_printf_loc (MSG_NOTE, vect_location,
2490 "Detected single element interleaving ");
2491 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2492 dump_printf (MSG_NOTE, " step ");
2493 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2494 dump_printf (MSG_NOTE, "\n");
2497 return true;
2500 if (dump_enabled_p ())
2502 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2503 "not consecutive access ");
2504 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2505 stmt_info->stmt, 0);
2508 if (bb_vinfo)
2510 /* Mark the statement as unvectorizable. */
2511 STMT_VINFO_VECTORIZABLE (vect_dr_stmt (dr)) = false;
2512 return true;
2515 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2516 STMT_VINFO_STRIDED_P (stmt_info) = true;
2517 return true;
2520 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2522 /* First stmt in the interleaving chain. Check the chain. */
2523 gimple *next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2524 struct data_reference *data_ref = dr;
2525 unsigned int count = 1;
2526 tree prev_init = DR_INIT (data_ref);
2527 gimple *prev = stmt_info;
2528 HOST_WIDE_INT diff, gaps = 0;
2530 /* By construction, all group members have INTEGER_CST DR_INITs. */
2531 while (next)
2533 /* Skip same data-refs. If two or more stmts share a
2534 data-ref (supported only for loads), we vectorize only the first
2535 stmt, and the rest get their vectorized loads from the first
2536 one. */
2537 if (!tree_int_cst_compare (DR_INIT (data_ref),
2538 DR_INIT (STMT_VINFO_DATA_REF (
2539 vinfo_for_stmt (next)))))
2541 if (DR_IS_WRITE (data_ref))
2543 if (dump_enabled_p ())
2544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2545 "Two store stmts share the same dr.\n");
2546 return false;
2549 if (dump_enabled_p ())
2550 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2551 "Two or more load stmts share the same dr.\n");
2553 /* For load use the same data-ref load. */
2554 DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2556 prev = next;
2557 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2558 continue;
2561 prev = next;
2562 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2564 /* All group members have the same STEP by construction. */
2565 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2567 /* Check that the distance between two accesses is equal to the type
2568 size. Otherwise, we have gaps. */
2569 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2570 - TREE_INT_CST_LOW (prev_init)) / type_size;
2571 if (diff != 1)
2573 /* FORNOW: SLP of accesses with gaps is not supported. */
2574 slp_impossible = true;
2575 if (DR_IS_WRITE (data_ref))
2577 if (dump_enabled_p ())
2578 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2579 "interleaved store with gaps\n");
2580 return false;
2583 gaps += diff - 1;
2586 last_accessed_element += diff;
2588 /* Store the gap from the previous member of the group. If there is no
2589 gap in the access, DR_GROUP_GAP is always 1. */
2590 DR_GROUP_GAP (vinfo_for_stmt (next)) = diff;
2592 prev_init = DR_INIT (data_ref);
2593 next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2594 /* Count the number of data-refs in the chain. */
2595 count++;
2598 if (groupsize == 0)
2599 groupsize = count + gaps;
2601 /* This could be UINT_MAX but as we are generating code in a very
2602 inefficient way we have to cap earlier. See PR78699 for example. */
2603 if (groupsize > 4096)
2605 if (dump_enabled_p ())
2606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2607 "group is too large\n");
2608 return false;
2611 /* Check that the size of the interleaving is equal to count for stores,
2612 i.e., that there are no gaps. */
2613 if (groupsize != count
2614 && !DR_IS_READ (dr))
2616 if (dump_enabled_p ())
2617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2618 "interleaved store with gaps\n");
2619 return false;
2622 /* If there is a gap after the last load in the group it is the
2623 difference between the groupsize and the last accessed
2624 element.
2625 When there is no gap, this difference should be 0. */
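  /* E.g. a group of size 4 whose last accessed element is 2 gets a
     trailing DR_GROUP_GAP of 4 - 2 == 2 (an illustration of the
     subtraction below).  */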
2626 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2628 DR_GROUP_SIZE (stmt_info) = groupsize;
2629 if (dump_enabled_p ())
2631 dump_printf_loc (MSG_NOTE, vect_location,
2632 "Detected interleaving ");
2633 if (DR_IS_READ (dr))
2634 dump_printf (MSG_NOTE, "load ");
2635 else
2636 dump_printf (MSG_NOTE, "store ");
2637 dump_printf (MSG_NOTE, "of size %u starting with ",
2638 (unsigned)groupsize);
2639 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
2640 if (DR_GROUP_GAP (stmt_info) != 0)
2641 dump_printf_loc (MSG_NOTE, vect_location,
2642 "There is a gap of %u elements after the group\n",
2643 DR_GROUP_GAP (stmt_info));
2646 /* SLP: create an SLP data structure for every interleaving group of
2647 stores for further analysis in vect_analyse_slp. */
2648 if (DR_IS_WRITE (dr) && !slp_impossible)
2650 if (loop_vinfo)
2651 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2652 if (bb_vinfo)
2653 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2657 return true;
2660 /* Analyze groups of accesses: check that DR belongs to a group of
2661 accesses of legal size, step, etc. Detect gaps, single element
2662 interleaving, and other special cases. Set grouped access info.
2663 Collect groups of strided stores for further use in SLP analysis. */
2665 static bool
2666 vect_analyze_group_access (struct data_reference *dr)
2668 if (!vect_analyze_group_access_1 (dr))
2670 /* Dissolve the group if present. */
2671 gimple *next;
2672 gimple *stmt = DR_GROUP_FIRST_ELEMENT (vect_dr_stmt (dr));
2673 while (stmt)
2675 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2676 next = DR_GROUP_NEXT_ELEMENT (vinfo);
2677 DR_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2678 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2679 stmt = next;
2681 return false;
2683 return true;
2686 /* Analyze the access pattern of the data-reference DR.
2687 In case of non-consecutive accesses call vect_analyze_group_access() to
2688 analyze groups of accesses. */
2690 static bool
2691 vect_analyze_data_ref_access (struct data_reference *dr)
2693 tree step = DR_STEP (dr);
2694 tree scalar_type = TREE_TYPE (DR_REF (dr));
2695 stmt_vec_info stmt_info = vect_dr_stmt (dr);
2696 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2697 struct loop *loop = NULL;
2699 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2700 return true;
2702 if (loop_vinfo)
2703 loop = LOOP_VINFO_LOOP (loop_vinfo);
2705 if (loop_vinfo && !step)
2707 if (dump_enabled_p ())
2708 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2709 "bad data-ref access in loop\n");
2710 return false;
2713 /* Allow loads with zero step in inner-loop vectorization. */
2714 if (loop_vinfo && integer_zerop (step))
2716 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2717 if (!nested_in_vect_loop_p (loop, stmt_info))
2718 return DR_IS_READ (dr);
2719 /* Allow references with zero step for outer loops marked
2720 with pragma omp simd only - it guarantees absence of
2721 loop-carried dependencies between inner loop iterations. */
2722 if (loop->safelen < 2)
2724 if (dump_enabled_p ())
2725 dump_printf_loc (MSG_NOTE, vect_location,
2726 "zero step in inner loop of nest\n");
2727 return false;
2731 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2733 /* Interleaved accesses are not yet supported within outer-loop
2734 vectorization for references in the inner-loop. */
2735 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2737 /* For the rest of the analysis we use the outer-loop step. */
2738 step = STMT_VINFO_DR_STEP (stmt_info);
2739 if (integer_zerop (step))
2741 if (dump_enabled_p ())
2742 dump_printf_loc (MSG_NOTE, vect_location,
2743 "zero step in outer loop.\n");
2744 return DR_IS_READ (dr);
2748 /* Consecutive? */
2749 if (TREE_CODE (step) == INTEGER_CST)
2751 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2752 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2753 || (dr_step < 0
2754 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2756 /* Mark that it is not interleaving. */
2757 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2758 return true;
2762 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2764 if (dump_enabled_p ())
2765 dump_printf_loc (MSG_NOTE, vect_location,
2766 "grouped access in outer loop.\n");
2767 return false;
2771 /* Assume this is a DR handled by non-constant strided load case. */
2772 if (TREE_CODE (step) != INTEGER_CST)
2773 return (STMT_VINFO_STRIDED_P (stmt_info)
2774 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2775 || vect_analyze_group_access (dr)));
2777 /* Not a consecutive access - check if it's part of an interleaving group. */
2778 return vect_analyze_group_access (dr);
2781 /* Compare two data-references DRA and DRB to group them into chunks
2782 suitable for grouping. */
2784 static int
2785 dr_group_sort_cmp (const void *dra_, const void *drb_)
2787 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2788 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2789 int cmp;
2791 /* Stabilize sort. */
2792 if (dra == drb)
2793 return 0;
2795 /* DRs in different loops never belong to the same group. */
2796 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2797 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2798 if (loopa != loopb)
2799 return loopa->num < loopb->num ? -1 : 1;
2801 /* Ordering of DRs according to base. */
2802 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2803 DR_BASE_ADDRESS (drb));
2804 if (cmp != 0)
2805 return cmp;
2807 /* And according to DR_OFFSET. */
2808 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2809 if (cmp != 0)
2810 return cmp;
2812 /* Put reads before writes. */
2813 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2814 return DR_IS_READ (dra) ? -1 : 1;
2816 /* Then sort by access size. */
2817 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2818 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2819 if (cmp != 0)
2820 return cmp;
2822 /* And by step. */
2823 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2824 if (cmp != 0)
2825 return cmp;
2827 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2828 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2829 if (cmp == 0)
2830 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2831 return cmp;
2834 /* If OP is the result of a conversion, return the unconverted value,
2835 otherwise return null. */
2837 static tree
2838 strip_conversion (tree op)
2840 if (TREE_CODE (op) != SSA_NAME)
2841 return NULL_TREE;
2842 gimple *stmt = SSA_NAME_DEF_STMT (op);
2843 if (!is_gimple_assign (stmt)
2844 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2845 return NULL_TREE;
2846 return gimple_assign_rhs1 (stmt);
2849 /* Return true if vectorizable_* routines can handle statements STMT1
2850 and STMT2 being in a single group. */
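/* For instance, two internal-function calls such as

     _1 = .MASK_LOAD (p_1, align, mask_1);
     _2 = .MASK_LOAD (p_2, align, mask_2);

   (a sketch with made-up SSA names) can be grouped when both use
   IFN_MASK_LOAD and their masks are equal, possibly modulo a conversion
   of one mask; argument 2 of the call is the mask, as used below.  */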
2852 static bool
2853 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2855 if (gimple_assign_single_p (stmt1))
2856 return gimple_assign_single_p (stmt2);
2858 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2860 /* Check for two masked loads or two masked stores. */
2861 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2862 return false;
2863 internal_fn ifn = gimple_call_internal_fn (stmt1);
2864 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2865 return false;
2866 if (ifn != gimple_call_internal_fn (stmt2))
2867 return false;
2869 /* Check that the masks are the same. Cope with casts of masks,
2870 like those created by build_mask_conversion. */
2871 tree mask1 = gimple_call_arg (stmt1, 2);
2872 tree mask2 = gimple_call_arg (stmt2, 2);
2873 if (!operand_equal_p (mask1, mask2, 0))
2875 mask1 = strip_conversion (mask1);
2876 if (!mask1)
2877 return false;
2878 mask2 = strip_conversion (mask2);
2879 if (!mask2)
2880 return false;
2881 if (!operand_equal_p (mask1, mask2, 0))
2882 return false;
2884 return true;
2887 return false;
2890 /* Function vect_analyze_data_ref_accesses.
2892 Analyze the access pattern of all the data references in the loop.
2894 FORNOW: the only access pattern that is considered vectorizable is a
2895 simple step 1 (consecutive) access.
2897 FORNOW: handle only arrays and pointer accesses. */
2899 bool
2900 vect_analyze_data_ref_accesses (vec_info *vinfo)
2902 unsigned int i;
2903 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2904 struct data_reference *dr;
2906 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2908 if (datarefs.is_empty ())
2909 return true;
2911 /* Sort the array of datarefs to make building the interleaving chains
2912 linear. Don't modify the original vector's order, it is needed for
2913 determining what dependencies are reversed. */
2914 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2915 datarefs_copy.qsort (dr_group_sort_cmp);
2917 /* Build the interleaving chains. */
2918 for (i = 0; i < datarefs_copy.length () - 1;)
2920 data_reference_p dra = datarefs_copy[i];
2921 stmt_vec_info stmtinfo_a = vect_dr_stmt (dra);
2922 stmt_vec_info lastinfo = NULL;
2923 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2924 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2926 ++i;
2927 continue;
2929 for (i = i + 1; i < datarefs_copy.length (); ++i)
2931 data_reference_p drb = datarefs_copy[i];
2932 stmt_vec_info stmtinfo_b = vect_dr_stmt (drb);
2933 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2934 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2935 break;
2937 /* ??? Imperfect sorting (non-compatible types, non-modulo
2938 accesses, same accesses) can lead to a group being artificially
2939 split here as we don't just skip over those. If it really
2940 matters we can push those to a worklist and re-iterate
2941 over them. Then we can just skip ahead to the next DR here. */
2943 /* DRs in a different loop should not be put into the same
2944 interleaving group. */
2945 if (gimple_bb (DR_STMT (dra))->loop_father
2946 != gimple_bb (DR_STMT (drb))->loop_father)
2947 break;
2949 /* Check that the data-refs have same first location (except init)
2950 and they are both either store or load (not load and store,
2951 not masked loads or stores). */
2952 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2953 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2954 DR_BASE_ADDRESS (drb)) != 0
2955 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2956 || !can_group_stmts_p (vect_dr_stmt (dra), vect_dr_stmt (drb)))
2957 break;
2959 /* Check that the data-refs have the same constant size. */
2960 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2961 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2962 if (!tree_fits_uhwi_p (sza)
2963 || !tree_fits_uhwi_p (szb)
2964 || !tree_int_cst_equal (sza, szb))
2965 break;
2967 /* Check that the data-refs have the same step. */
2968 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2969 break;
2971 /* Check the types are compatible.
2972 ??? We don't distinguish this during sorting. */
2973 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2974 TREE_TYPE (DR_REF (drb))))
2975 break;
2977 /* Check that the DR_INITs are compile-time constants. */
2978 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2979 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2980 break;
2982 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2983 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2984 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2985 HOST_WIDE_INT init_prev
2986 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2987 gcc_assert (init_a <= init_b
2988 && init_a <= init_prev
2989 && init_prev <= init_b);
2991 /* Do not place the same access in the interleaving chain twice. */
2992 if (init_b == init_prev)
2994 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2995 < gimple_uid (DR_STMT (drb)));
2996 /* ??? For now we simply "drop" the later reference which is
2997 otherwise the same rather than finishing off this group.
2998 In the end we'd want to re-process duplicates forming
2999 multiple groups from the refs, likely by just collecting
3000 all candidates (including duplicates and split points
3001 below) in a vector and then process them together. */
3002 continue;
3005 /* If init_b == init_a + the size of the type * k, we have an
3006 interleaving, and DRA is accessed before DRB. */
3007 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3008 if (type_size_a == 0
3009 || (init_b - init_a) % type_size_a != 0)
3010 break;
3012 /* For stores the accesses must be adjacent. This splits
3013 groups into chunks we support (we don't support vectorization
3014 of stores with gaps). */
3015 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3016 break;
3018 /* If the step (when constant and nonzero) is greater than the
3019 difference between data-refs' inits this splits groups into
3020 suitable sizes. */
3021 if (tree_fits_shwi_p (DR_STEP (dra)))
3023 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3024 if (step != 0 && step <= (init_b - init_a))
3025 break;
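	  /* To illustrate the checks above, assuming 4-byte int accesses
	     and the usual data-ref decomposition: loads a[2*i] and
	     a[2*i + 1] share the base, offset and step (8 bytes); their
	     DR_INITs differ by 4, a multiple of the type size and smaller
	     than the step, so they are linked into one interleaving group
	     below.  */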
3028 if (dump_enabled_p ())
3030 dump_printf_loc (MSG_NOTE, vect_location,
3031 "Detected interleaving ");
3032 if (DR_IS_READ (dra))
3033 dump_printf (MSG_NOTE, "load ");
3034 else
3035 dump_printf (MSG_NOTE, "store ");
3036 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3037 dump_printf (MSG_NOTE, " and ");
3038 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3039 dump_printf (MSG_NOTE, "\n");
3042 /* Link the found element into the group list. */
3043 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3045 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = vect_dr_stmt (dra);
3046 lastinfo = stmtinfo_a;
3048 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = vect_dr_stmt (dra);
3049 DR_GROUP_NEXT_ELEMENT (lastinfo) = vect_dr_stmt (drb);
3050 lastinfo = stmtinfo_b;
3054 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3055 if (STMT_VINFO_VECTORIZABLE (vect_dr_stmt (dr))
3056 && !vect_analyze_data_ref_access (dr))
3058 if (dump_enabled_p ())
3059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3060 "not vectorized: complicated access pattern.\n");
3062 if (is_a <bb_vec_info> (vinfo))
3064 /* Mark the statement as not vectorizable. */
3065 STMT_VINFO_VECTORIZABLE (vect_dr_stmt (dr)) = false;
3066 continue;
3068 else
3070 datarefs_copy.release ();
3071 return false;
3075 datarefs_copy.release ();
3076 return true;
3079 /* Function vect_vfa_segment_size.
3081 Input:
3082 DR: The data reference.
3083 LENGTH_FACTOR: segment length to consider.
3085 Return a value suitable for the dr_with_seg_len::seg_len field.
3086 This is the "distance travelled" by the pointer from the first
3087 iteration in the segment to the last. Note that it does not include
3088 the size of the access; in effect it only describes the first byte. */
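/* For example, with DR_STEP == 4 and LENGTH_FACTOR == 8 (say, the
   vectorization factor) the segment length is 4 * (8 - 1) == 28 bytes:
   the distance from the first of the eight accesses to the last, not
   counting the size of the final access itself.  */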
3090 static tree
3091 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3093 length_factor = size_binop (MINUS_EXPR,
3094 fold_convert (sizetype, length_factor),
3095 size_one_node);
3096 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3097 length_factor);
3100 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3101 gives the worst-case number of bytes covered by the segment. */
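/* E.g. a 4-byte scalar access that leads a group of size 4 with a
   trailing gap of 1 yields an access size of 4 * (4 - 1) == 12 bytes;
   the explicit-realign-optimized case below further adds the difference
   between the vector size and the scalar size (a sketch of the
   computation that follows).  */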
3103 static unsigned HOST_WIDE_INT
3104 vect_vfa_access_size (data_reference *dr)
3106 stmt_vec_info stmt_vinfo = vect_dr_stmt (dr);
3107 tree ref_type = TREE_TYPE (DR_REF (dr));
3108 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3109 unsigned HOST_WIDE_INT access_size = ref_size;
3110 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3112 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == vect_dr_stmt (dr));
3113 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3115 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3116 && (vect_supportable_dr_alignment (dr, false)
3117 == dr_explicit_realign_optimized))
3119 /* We might access a full vector's worth. */
3120 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3121 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3123 return access_size;
3126 /* Get the minimum alignment for all the scalar accesses that DR describes. */
3128 static unsigned int
3129 vect_vfa_align (const data_reference *dr)
3131 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3134 /* Function vect_no_alias_p.
3136 Given data references A and B with equal base and offset, see whether
3137 the alias relation can be decided at compilation time. Return 1 if
3138 it can and the references alias, 0 if it can and the references do
3139 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3140 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3141 of dr_with_seg_len::{seg_len,access_size} for A and B. */
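/* A small illustration, assuming positive steps and 4-byte accesses:
   with DR_INITs 0 and 8, segment length 12 and access size 4 for both,
   the byte ranges are [0, 16) and [8, 24), which are known to overlap,
   so the function returns 1; if the second DR_INIT were 16 the ranges
   would be disjoint and it would return 0.  */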
3143 static int
3144 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3145 tree segment_length_a, tree segment_length_b,
3146 unsigned HOST_WIDE_INT access_size_a,
3147 unsigned HOST_WIDE_INT access_size_b)
3149 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3150 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3151 poly_uint64 const_length_a;
3152 poly_uint64 const_length_b;
3154 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3155 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3156 [a, a+12) */
3157 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3159 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3160 offset_a = (offset_a + access_size_a) - const_length_a;
3162 else
3163 const_length_a = tree_to_poly_uint64 (segment_length_a);
3164 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3166 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3167 offset_b = (offset_b + access_size_b) - const_length_b;
3169 else
3170 const_length_b = tree_to_poly_uint64 (segment_length_b);
3172 const_length_a += access_size_a;
3173 const_length_b += access_size_b;
3175 if (ranges_known_overlap_p (offset_a, const_length_a,
3176 offset_b, const_length_b))
3177 return 1;
3179 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3180 offset_b, const_length_b))
3181 return 0;
3183 return -1;
3186 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3187 in DDR is >= VF. */
3189 static bool
3190 dependence_distance_ge_vf (data_dependence_relation *ddr,
3191 unsigned int loop_depth, poly_uint64 vf)
3193 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3194 || DDR_NUM_DIST_VECTS (ddr) == 0)
3195 return false;
3197 /* If the dependence is exact, we should have limited the VF instead. */
3198 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3200 unsigned int i;
3201 lambda_vector dist_v;
3202 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3204 HOST_WIDE_INT dist = dist_v[loop_depth];
3205 if (dist != 0
3206 && !(dist > 0 && DDR_REVERSED_P (ddr))
3207 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3208 return false;
3211 if (dump_enabled_p ())
3213 dump_printf_loc (MSG_NOTE, vect_location,
3214 "dependence distance between ");
3215 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3216 dump_printf (MSG_NOTE, " and ");
3217 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3218 dump_printf (MSG_NOTE, " is >= VF\n");
3221 return true;
3224 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3226 static void
3227 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3229 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3230 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3231 dump_printf (dump_kind, ") >= ");
3232 dump_dec (dump_kind, lower_bound.min_value);
3235 /* Record that the vectorized loop requires the vec_lower_bound described
3236 by EXPR, UNSIGNED_P and MIN_VALUE. */
3238 static void
3239 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3240 poly_uint64 min_value)
3242 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3243 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3244 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3246 unsigned_p &= lower_bounds[i].unsigned_p;
3247 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3248 if (lower_bounds[i].unsigned_p != unsigned_p
3249 || maybe_lt (lower_bounds[i].min_value, min_value))
3251 lower_bounds[i].unsigned_p = unsigned_p;
3252 lower_bounds[i].min_value = min_value;
3253 if (dump_enabled_p ())
3255 dump_printf_loc (MSG_NOTE, vect_location,
3256 "updating run-time check to ");
3257 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3258 dump_printf (MSG_NOTE, "\n");
3261 return;
3264 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3265 if (dump_enabled_p ())
3267 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3268 dump_lower_bound (MSG_NOTE, lower_bound);
3269 dump_printf (MSG_NOTE, "\n");
3271 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3274 /* Return true if it's unlikely that the step of the vectorized form of DR
3275 will span fewer than GAP bytes. */
3277 static bool
3278 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3280 stmt_vec_info stmt_info = vect_dr_stmt (dr);
3281 HOST_WIDE_INT count
3282 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3283 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3284 count *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
3285 return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3288 /* Return true if we know that there is no alias between DR_A and DR_B
3289 when abs (DR_STEP (DR_A)) >= N for some N. When returning true, set
3290 *LOWER_BOUND_OUT to this N. */
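/* For instance, two 4-byte references at constant offsets 0 and 12 from
   the same base together span bytes [0, 16) within one scalar iteration,
   so the lower bound is 12 + 4 - 0 == 16: once abs (DR_STEP) >= 16 each
   iteration accesses a disjoint 16-byte window and the references cannot
   alias.  */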
3292 static bool
3293 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3294 poly_uint64 *lower_bound_out)
3296 /* Check that there is a constant gap of known sign between DR_A
3297 and DR_B. */
3298 poly_int64 init_a, init_b;
3299 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3300 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3301 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3302 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3303 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3304 || !ordered_p (init_a, init_b))
3305 return false;
3307 /* Sort DR_A and DR_B by the address they access. */
3308 if (maybe_lt (init_b, init_a))
3310 std::swap (init_a, init_b);
3311 std::swap (dr_a, dr_b);
3314 /* If the two accesses could be dependent within a scalar iteration,
3315 make sure that we'd retain their order. */
3316 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3317 && !vect_preserves_scalar_order_p (vect_dr_stmt (dr_a),
3318 vect_dr_stmt (dr_b)))
3319 return false;
3321 /* There is no alias if abs (DR_STEP) is greater than or equal to
3322 the bytes spanned by the combination of the two accesses. */
3323 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3324 return true;
3327 /* Function vect_prune_runtime_alias_test_list.
3329 Prune a list of ddrs to be tested at run-time by versioning for alias.
3330 Merge several alias checks into one if possible.
3331 Return FALSE if the resulting list of ddrs is longer than allowed by
3332 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3334 bool
3335 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3337 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3338 hash_set <tree_pair_hash> compared_objects;
3340 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3341 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3342 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3343 vec<vec_object_pair> &check_unequal_addrs
3344 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3345 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3346 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3348 ddr_p ddr;
3349 unsigned int i;
3350 tree length_factor;
3352 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3354 /* Step values are irrelevant for aliasing if the number of vector
3355 iterations is equal to the number of scalar iterations (which can
3356 happen for fully-SLP loops). */
3357 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3359 if (!ignore_step_p)
3361 /* Convert the checks for nonzero steps into bound tests. */
3362 tree value;
3363 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3364 vect_check_lower_bound (loop_vinfo, value, true, 1);
3367 if (may_alias_ddrs.is_empty ())
3368 return true;
3370 comp_alias_ddrs.create (may_alias_ddrs.length ());
3372 unsigned int loop_depth
3373 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3374 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3376 /* First, we collect all data ref pairs for aliasing checks. */
3377 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3379 int comp_res;
3380 poly_uint64 lower_bound;
3381 struct data_reference *dr_a, *dr_b;
3382 gimple *dr_group_first_a, *dr_group_first_b;
3383 tree segment_length_a, segment_length_b;
3384 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3385 unsigned int align_a, align_b;
3386 gimple *stmt_a, *stmt_b;
3388 /* Ignore the alias if the VF we chose ended up being no greater
3389 than the dependence distance. */
3390 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3391 continue;
3393 if (DDR_OBJECT_A (ddr))
3395 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3396 if (!compared_objects.add (new_pair))
3398 if (dump_enabled_p ())
3400 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3401 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3402 dump_printf (MSG_NOTE, " and ");
3403 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3404 dump_printf (MSG_NOTE, " have different addresses\n");
3406 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3408 continue;
3411 dr_a = DDR_A (ddr);
3412 stmt_a = vect_dr_stmt (DDR_A (ddr));
3414 dr_b = DDR_B (ddr);
3415 stmt_b = vect_dr_stmt (DDR_B (ddr));
3417 /* Skip the pair if inter-iteration dependencies are irrelevant
3418 and intra-iteration dependencies are guaranteed to be honored. */
3419 if (ignore_step_p
3420 && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3421 || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3423 if (dump_enabled_p ())
3425 dump_printf_loc (MSG_NOTE, vect_location,
3426 "no need for alias check between ");
3427 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3428 dump_printf (MSG_NOTE, " and ");
3429 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3430 dump_printf (MSG_NOTE, " when VF is 1\n");
3432 continue;
3435 /* See whether we can handle the alias using a bounds check on
3436 the step, and whether that's likely to be the best approach.
3437 (It might not be, for example, if the minimum step is much larger
3438 than the number of bytes handled by one vector iteration.) */
3439 if (!ignore_step_p
3440 && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3441 && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3442 && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3443 || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3445 bool unsigned_p = dr_known_forward_stride_p (dr_a);
3446 if (dump_enabled_p ())
3448 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3449 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3450 dump_printf (MSG_NOTE, " and ");
3451 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3452 dump_printf (MSG_NOTE, " when the step ");
3453 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3454 dump_printf (MSG_NOTE, " is outside ");
3455 if (unsigned_p)
3456 dump_printf (MSG_NOTE, "[0");
3457 else
3459 dump_printf (MSG_NOTE, "(");
3460 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3462 dump_printf (MSG_NOTE, ", ");
3463 dump_dec (MSG_NOTE, lower_bound);
3464 dump_printf (MSG_NOTE, ")\n");
3466 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3467 lower_bound);
3468 continue;
3471 dr_group_first_a = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3472 if (dr_group_first_a)
3474 stmt_a = dr_group_first_a;
3475 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3478 dr_group_first_b = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3479 if (dr_group_first_b)
3481 stmt_b = dr_group_first_b;
3482 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3485 if (ignore_step_p)
3487 segment_length_a = size_zero_node;
3488 segment_length_b = size_zero_node;
3490 else
3492 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3493 length_factor = scalar_loop_iters;
3494 else
3495 length_factor = size_int (vect_factor);
3496 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3497 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3499 access_size_a = vect_vfa_access_size (dr_a);
3500 access_size_b = vect_vfa_access_size (dr_b);
3501 align_a = vect_vfa_align (dr_a);
3502 align_b = vect_vfa_align (dr_b);
3504 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3505 DR_BASE_ADDRESS (dr_b));
3506 if (comp_res == 0)
3507 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3508 DR_OFFSET (dr_b));
3510 /* See whether the alias is known at compilation time. */
3511 if (comp_res == 0
3512 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3513 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3514 && poly_int_tree_p (segment_length_a)
3515 && poly_int_tree_p (segment_length_b))
3517 int res = vect_compile_time_alias (dr_a, dr_b,
3518 segment_length_a,
3519 segment_length_b,
3520 access_size_a,
3521 access_size_b);
3522 if (res >= 0 && dump_enabled_p ())
3524 dump_printf_loc (MSG_NOTE, vect_location,
3525 "can tell at compile time that ");
3526 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3527 dump_printf (MSG_NOTE, " and ");
3528 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3529 if (res == 0)
3530 dump_printf (MSG_NOTE, " do not alias\n");
3531 else
3532 dump_printf (MSG_NOTE, " alias\n");
3535 if (res == 0)
3536 continue;
3538 if (res == 1)
3540 if (dump_enabled_p ())
3541 dump_printf_loc (MSG_NOTE, vect_location,
3542 "not vectorized: compilation time alias.\n");
3543 return false;
3547 dr_with_seg_len_pair_t dr_with_seg_len_pair
3548 (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3549 dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3551 /* Canonicalize pairs by sorting the two DR members. */
3552 if (comp_res > 0)
3553 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3555 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3558 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3560 unsigned int count = (comp_alias_ddrs.length ()
3561 + check_unequal_addrs.length ());
3563 dump_printf_loc (MSG_NOTE, vect_location,
3564 "improved number of alias checks from %d to %d\n",
3565 may_alias_ddrs.length (), count);
3566 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3568 if (dump_enabled_p ())
3569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3570 "number of versioning for alias "
3571 "run-time tests exceeds %d "
3572 "(--param vect-max-version-for-alias-checks)\n",
3573 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3574 return false;
3577 return true;
3580 /* Check whether we can use an internal function for a gather load
3581 or scatter store. READ_P is true for loads and false for stores.
3582 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3583 the type of the memory elements being loaded or stored. OFFSET_BITS
3584 is the number of bits in each scalar offset and OFFSET_SIGN is the
3585 sign of the offset. SCALE is the amount by which the offset should
3586 be multiplied *after* it has been converted to address width.
3588 Return true if the function is supported, storing the function
3589 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
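/* As an illustrative (hypothetical) example: an unmasked gather of V4SF
   elements from 32-bit float memory with 32-bit signed offsets would map to
   IFN_GATHER_LOAD and a masked scatter to IFN_MASK_SCATTER_STORE; whether a
   given SCALE is usable is decided by the
   internal_gather_scatter_fn_supported_p check below.  */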
3591 bool
3592 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3593 tree memory_type, unsigned int offset_bits,
3594 signop offset_sign, int scale,
3595 internal_fn *ifn_out, tree *element_type_out)
3597 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3598 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3599 if (offset_bits > element_bits)
3600 /* Internal functions require the offset to be the same width as
3601 the vector elements. We can extend narrower offsets, but it isn't
3602 safe to truncate wider offsets. */
3603 return false;
3605 if (element_bits != memory_bits)
3606 /* For now the vector elements must be the same width as the
3607 memory elements. */
3608 return false;
3610 /* Work out which function we need. */
3611 internal_fn ifn;
3612 if (read_p)
3613 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3614 else
3615 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3617 /* Test whether the target supports this combination. */
3618 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3619 offset_sign, scale))
3620 return false;
3622 *ifn_out = ifn;
3623 *element_type_out = TREE_TYPE (vectype);
3624 return true;
3627 /* CALL is a call to an internal gather load or scatter store function.
3628 Describe the operation in INFO. */
3630 static void
3631 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3633 stmt_vec_info stmt_info = vinfo_for_stmt (call);
3634 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3635 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3637 info->ifn = gimple_call_internal_fn (call);
3638 info->decl = NULL_TREE;
3639 info->base = gimple_call_arg (call, 0);
3640 info->offset = gimple_call_arg (call, 1);
3641 info->offset_dt = vect_unknown_def_type;
3642 info->offset_vectype = NULL_TREE;
3643 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3644 info->element_type = TREE_TYPE (vectype);
3645 info->memory_type = TREE_TYPE (DR_REF (dr));
3648 /* Return true if a non-affine read or write in STMT is suitable for a
3649 gather load or scatter store. Describe the operation in *INFO if so. */
3651 bool
3652 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3653 gather_scatter_info *info)
3655 HOST_WIDE_INT scale = 1;
3656 poly_int64 pbitpos, pbitsize;
3657 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3658 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3659 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3660 tree offtype = NULL_TREE;
3661 tree decl = NULL_TREE, base, off;
3662 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3663 tree memory_type = TREE_TYPE (DR_REF (dr));
3664 machine_mode pmode;
3665 int punsignedp, reversep, pvolatilep = 0;
3666 internal_fn ifn;
3667 tree element_type;
3668 bool masked_p = false;
3670 /* See whether this is already a call to a gather/scatter internal function.
3671 If not, see whether it's a masked load or store. */
3672 gcall *call = dyn_cast <gcall *> (stmt);
3673 if (call && gimple_call_internal_p (call))
3675 ifn = gimple_call_internal_fn (stmt);
3676 if (internal_gather_scatter_fn_p (ifn))
3678 vect_describe_gather_scatter_call (call, info);
3679 return true;
3681 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3684 /* True if we should aim to use internal functions rather than
3685 built-in functions. */
3686 bool use_ifn_p = (DR_IS_READ (dr)
3687 ? supports_vec_gather_load_p ()
3688 : supports_vec_scatter_store_p ());
3690 base = DR_REF (dr);
3691 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3692 see if we can use the def stmt of the address. */
3693 if (masked_p
3694 && TREE_CODE (base) == MEM_REF
3695 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3696 && integer_zerop (TREE_OPERAND (base, 1))
3697 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3699 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3700 if (is_gimple_assign (def_stmt)
3701 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3702 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3705 /* The gather and scatter builtins need address of the form
3706 loop_invariant + vector * {1, 2, 4, 8}
3708 or loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3709 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3710 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3711 multiplications and additions in it. To get a vector, we need
3712 a single SSA_NAME that will be defined in the loop and will
3713 contain everything that is not loop invariant and that can be
3714 vectorized. The following code attempts to find such a preexisting
3715 SSA_NAME OFF and put the loop invariants into a tree BASE
3716 that can be gimplified before the loop. */
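/* For instance, for an access like ptr[k] with loop-invariant PTR and a
   loop-defined SSA_NAME K, the code below aims to end up with OFF = k
   (the multiplication by the element size peeled into SCALE, when the
   target supports that scale) and BASE holding the loop-invariant part of
   the address.  This sketches the common case only; the exact decomposition
   depends on what DR_BASE_ADDRESS/DR_OFFSET look like.  */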
3717 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3718 &punsignedp, &reversep, &pvolatilep);
3719 if (reversep)
3720 return false;
3722 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3724 if (TREE_CODE (base) == MEM_REF)
3726 if (!integer_zerop (TREE_OPERAND (base, 1)))
3728 if (off == NULL_TREE)
3729 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3730 else
3731 off = size_binop (PLUS_EXPR, off,
3732 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3734 base = TREE_OPERAND (base, 0);
3736 else
3737 base = build_fold_addr_expr (base);
3739 if (off == NULL_TREE)
3740 off = size_zero_node;
3742 /* If base is not loop invariant, then if off is 0 we start with just
3743 the constant offset in the loop invariant BASE and continue with base
3744 as OFF; otherwise we give up.
3745 We could handle that case by gimplifying the addition of base + off
3746 into some SSA_NAME and using that as off, but for now punt. */
3747 if (!expr_invariant_in_loop_p (loop, base))
3749 if (!integer_zerop (off))
3750 return false;
3751 off = base;
3752 base = size_int (pbytepos);
3754 /* Otherwise put base + constant offset into the loop invariant BASE
3755 and continue with OFF. */
3756 else
3758 base = fold_convert (sizetype, base);
3759 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3762 /* OFF at this point may be either a SSA_NAME or some tree expression
3763 from get_inner_reference. Try to peel off loop invariants from it
3764 into BASE as long as possible. */
3765 STRIP_NOPS (off);
3766 while (offtype == NULL_TREE)
3768 enum tree_code code;
3769 tree op0, op1, add = NULL_TREE;
3771 if (TREE_CODE (off) == SSA_NAME)
3773 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3775 if (expr_invariant_in_loop_p (loop, off))
3776 return false;
3778 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3779 break;
3781 op0 = gimple_assign_rhs1 (def_stmt);
3782 code = gimple_assign_rhs_code (def_stmt);
3783 op1 = gimple_assign_rhs2 (def_stmt);
3785 else
3787 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3788 return false;
3789 code = TREE_CODE (off);
3790 extract_ops_from_tree (off, &code, &op0, &op1);
3792 switch (code)
3794 case POINTER_PLUS_EXPR:
3795 case PLUS_EXPR:
3796 if (expr_invariant_in_loop_p (loop, op0))
3798 add = op0;
3799 off = op1;
3800 do_add:
3801 add = fold_convert (sizetype, add);
3802 if (scale != 1)
3803 add = size_binop (MULT_EXPR, add, size_int (scale));
3804 base = size_binop (PLUS_EXPR, base, add);
3805 continue;
3807 if (expr_invariant_in_loop_p (loop, op1))
3809 add = op1;
3810 off = op0;
3811 goto do_add;
3813 break;
3814 case MINUS_EXPR:
3815 if (expr_invariant_in_loop_p (loop, op1))
3817 add = fold_convert (sizetype, op1);
3818 add = size_binop (MINUS_EXPR, size_zero_node, add);
3819 off = op0;
3820 goto do_add;
3822 break;
3823 case MULT_EXPR:
3824 if (scale == 1 && tree_fits_shwi_p (op1))
3826 int new_scale = tree_to_shwi (op1);
3827 /* Only treat this as a scaling operation if the target
3828 supports it. */
3829 if (use_ifn_p
3830 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3831 vectype, memory_type, 1,
3832 TYPE_SIGN (TREE_TYPE (op0)),
3833 new_scale, &ifn,
3834 &element_type))
3835 break;
3836 scale = new_scale;
3837 off = op0;
3838 continue;
3840 break;
3841 case SSA_NAME:
3842 off = op0;
3843 continue;
3844 CASE_CONVERT:
3845 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3846 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3847 break;
3848 if (TYPE_PRECISION (TREE_TYPE (op0))
3849 == TYPE_PRECISION (TREE_TYPE (off)))
3851 off = op0;
3852 continue;
3855 /* The internal functions need the offset to be the same width
3856 as the elements of VECTYPE. Don't include operations that
3857 cast the offset from that width to a different width. */
3858 if (use_ifn_p
3859 && (int_size_in_bytes (TREE_TYPE (vectype))
3860 == int_size_in_bytes (TREE_TYPE (off))))
3861 break;
3863 if (TYPE_PRECISION (TREE_TYPE (op0))
3864 < TYPE_PRECISION (TREE_TYPE (off)))
3866 off = op0;
3867 offtype = TREE_TYPE (off);
3868 STRIP_NOPS (off);
3869 continue;
3871 break;
3872 default:
3873 break;
3875 break;
3878 /* If at the end OFF still isn't a SSA_NAME or isn't
3879 defined in the loop, punt. */
3880 if (TREE_CODE (off) != SSA_NAME
3881 || expr_invariant_in_loop_p (loop, off))
3882 return false;
3884 if (offtype == NULL_TREE)
3885 offtype = TREE_TYPE (off);
3887 if (use_ifn_p)
3889 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3890 memory_type, TYPE_PRECISION (offtype),
3891 TYPE_SIGN (offtype), scale, &ifn,
3892 &element_type))
3893 return false;
3895 else
3897 if (DR_IS_READ (dr))
3899 if (targetm.vectorize.builtin_gather)
3900 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3902 else
3904 if (targetm.vectorize.builtin_scatter)
3905 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3908 if (!decl)
3909 return false;
3911 ifn = IFN_LAST;
3912 element_type = TREE_TYPE (vectype);
3915 info->ifn = ifn;
3916 info->decl = decl;
3917 info->base = base;
3918 info->offset = off;
3919 info->offset_dt = vect_unknown_def_type;
3920 info->offset_vectype = NULL_TREE;
3921 info->scale = scale;
3922 info->element_type = element_type;
3923 info->memory_type = memory_type;
3924 return true;
3927 /* Find the data references in STMT, analyze them with respect to LOOP and
3928 append them to DATAREFS. Return false if datarefs in this stmt cannot
3929 be handled. */
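/* For example, a plain load x_1 = a[i_2] yields exactly one data reference,
   which is analyzed and pushed to DATAREFS; volatile accesses, statements
   that may throw, bit-field accesses and calls other than
   IFN_MASK_LOAD/IFN_MASK_STORE are rejected below.  */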
3931 bool
3932 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3933 vec<data_reference_p> *datarefs)
3935 /* We can ignore clobbers for dataref analysis - they are removed during
3936 loop vectorization and BB vectorization checks dependences with a
3937 stmt walk. */
3938 if (gimple_clobber_p (stmt))
3939 return true;
3941 if (gimple_has_volatile_ops (stmt))
3943 if (dump_enabled_p ())
3945 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3946 "not vectorized: volatile type ");
3947 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3949 return false;
3952 if (stmt_can_throw_internal (stmt))
3954 if (dump_enabled_p ())
3956 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3957 "not vectorized: statement can throw an "
3958 "exception ");
3959 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3961 return false;
3964 auto_vec<data_reference_p, 2> refs;
3965 if (!find_data_references_in_stmt (loop, stmt, &refs))
3966 return false;
3968 if (refs.is_empty ())
3969 return true;
3971 if (refs.length () > 1)
3973 if (dump_enabled_p ())
3975 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3976 "not vectorized: more than one data ref "
3977 "in stmt: ");
3978 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3980 return false;
3983 if (gcall *call = dyn_cast <gcall *> (stmt))
3984 if (!gimple_call_internal_p (call)
3985 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
3986 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
3988 if (dump_enabled_p ())
3990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3991 "not vectorized: dr in a call ");
3992 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3994 return false;
3997 data_reference_p dr = refs.pop ();
3998 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3999 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4001 if (dump_enabled_p ())
4003 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4004 "not vectorized: statement is bitfield "
4005 "access ");
4006 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4008 return false;
4011 if (DR_BASE_ADDRESS (dr)
4012 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4014 if (dump_enabled_p ())
4015 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4016 "not vectorized: base addr of dr is a "
4017 "constant\n");
4018 return false;
4021 /* Check whether this may be a SIMD lane access and adjust the
4022 DR to make it easier for us to handle it. */
4023 if (loop
4024 && loop->simduid
4025 && (!DR_BASE_ADDRESS (dr)
4026 || !DR_OFFSET (dr)
4027 || !DR_INIT (dr)
4028 || !DR_STEP (dr)))
4030 struct data_reference *newdr
4031 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4032 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4033 if (DR_BASE_ADDRESS (newdr)
4034 && DR_OFFSET (newdr)
4035 && DR_INIT (newdr)
4036 && DR_STEP (newdr)
4037 && integer_zerop (DR_STEP (newdr)))
4039 tree off = DR_OFFSET (newdr);
4040 STRIP_NOPS (off);
4041 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4042 && TREE_CODE (off) == MULT_EXPR
4043 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4045 tree step = TREE_OPERAND (off, 1);
4046 off = TREE_OPERAND (off, 0);
4047 STRIP_NOPS (off);
4048 if (CONVERT_EXPR_P (off)
4049 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4050 < TYPE_PRECISION (TREE_TYPE (off))))
4051 off = TREE_OPERAND (off, 0);
4052 if (TREE_CODE (off) == SSA_NAME)
4054 gimple *def = SSA_NAME_DEF_STMT (off);
4055 tree reft = TREE_TYPE (DR_REF (newdr));
4056 if (is_gimple_call (def)
4057 && gimple_call_internal_p (def)
4058 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4060 tree arg = gimple_call_arg (def, 0);
4061 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4062 arg = SSA_NAME_VAR (arg);
4063 if (arg == loop->simduid
4064 /* For now. */
4065 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4067 DR_OFFSET (newdr) = ssize_int (0);
4068 DR_STEP (newdr) = step;
4069 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4070 DR_STEP_ALIGNMENT (newdr)
4071 = highest_pow2_factor (step);
4072 /* Mark as simd-lane access. */
4073 newdr->aux = (void *)-1;
4074 free_data_ref (dr);
4075 datarefs->safe_push (newdr);
4076 return true;
4082 free_data_ref (newdr);
4085 datarefs->safe_push (dr);
4086 return true;
4089 /* Function vect_analyze_data_refs.
4091 Find all the data references in the loop or basic block.
4093 The general structure of the analysis of data refs in the vectorizer is as
4094 follows:
4095 1- vect_analyze_data_refs(loop/bb): call
4096 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4097 in the loop/bb and their dependences.
4098 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4099 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4100 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4104 bool
4105 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4107 struct loop *loop = NULL;
4108 unsigned int i;
4109 struct data_reference *dr;
4110 tree scalar_type;
4112 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4114 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4115 loop = LOOP_VINFO_LOOP (loop_vinfo);
4117 /* Go through the data-refs, check that the analysis succeeded. Update
4118 pointer from stmt_vec_info struct to DR and vectype. */
4120 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4121 FOR_EACH_VEC_ELT (datarefs, i, dr)
4123 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4124 poly_uint64 vf;
4126 gcc_assert (DR_REF (dr));
4127 stmt_vec_info stmt_info = vect_dr_stmt (dr);
4129 /* Check that analysis of the data-ref succeeded. */
4130 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4131 || !DR_STEP (dr))
4133 bool maybe_gather
4134 = DR_IS_READ (dr)
4135 && !TREE_THIS_VOLATILE (DR_REF (dr))
4136 && (targetm.vectorize.builtin_gather != NULL
4137 || supports_vec_gather_load_p ());
4138 bool maybe_scatter
4139 = DR_IS_WRITE (dr)
4140 && !TREE_THIS_VOLATILE (DR_REF (dr))
4141 && (targetm.vectorize.builtin_scatter != NULL
4142 || supports_vec_scatter_store_p ());
4144 /* If target supports vector gather loads or scatter stores,
4145 see if they can be used. */
4146 if (is_a <loop_vec_info> (vinfo)
4147 && !nested_in_vect_loop_p (loop, stmt_info))
4149 if (maybe_gather || maybe_scatter)
4151 if (maybe_gather)
4152 gatherscatter = GATHER;
4153 else
4154 gatherscatter = SCATTER;
4158 if (gatherscatter == SG_NONE)
4160 if (dump_enabled_p ())
4162 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4163 "not vectorized: data ref analysis "
4164 "failed ");
4165 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4166 stmt_info->stmt, 0);
4168 if (is_a <bb_vec_info> (vinfo))
4170 /* In BB vectorization the ref can still participate
4171 in dependence analysis, we just can't vectorize it. */
4172 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4173 continue;
4175 return false;
4179 /* See if this was detected as a SIMD lane access. */
4180 if (dr->aux == (void *)-1)
4182 if (nested_in_vect_loop_p (loop, stmt_info))
4184 if (dump_enabled_p ())
4186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4187 "not vectorized: data ref analysis "
4188 "failed ");
4189 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4190 stmt_info->stmt, 0);
4192 return false;
4194 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4197 tree base = get_base_address (DR_REF (dr));
4198 if (base && VAR_P (base) && DECL_NONALIASED (base))
4200 if (dump_enabled_p ())
4202 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4203 "not vectorized: base object not addressable "
4204 "for stmt: ");
4205 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4206 stmt_info->stmt, 0);
4208 if (is_a <bb_vec_info> (vinfo))
4210 /* In BB vectorization the ref can still participate
4211 in dependence analysis, we just can't vectorize it. */
4212 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4213 continue;
4215 return false;
4218 if (is_a <loop_vec_info> (vinfo)
4219 && DR_STEP (dr)
4220 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4222 if (nested_in_vect_loop_p (loop, stmt_info))
4224 if (dump_enabled_p ())
4226 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4227 "not vectorized: not suitable for strided "
4228 "load ");
4229 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4230 stmt_info->stmt, 0);
4232 return false;
4234 STMT_VINFO_STRIDED_P (stmt_info) = true;
4237 /* Update DR field in stmt_vec_info struct. */
4239 /* If the dataref is in an inner-loop of the loop that is considered for
4240 vectorization, we also want to analyze the access relative to
4241 the outer-loop (DR contains information only relative to the
4242 inner-most enclosing loop). We do that by building a reference to the
4243 first location accessed by the inner-loop, and analyze it relative to
4244 the outer-loop. */
4245 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4247 /* Build a reference to the first location accessed by the
4248 inner loop: *(BASE + INIT + OFFSET). By construction,
4249 this address must be invariant in the inner loop, so we
4250 can consider it as being used in the outer loop. */
4251 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4252 tree offset = unshare_expr (DR_OFFSET (dr));
4253 tree init = unshare_expr (DR_INIT (dr));
4254 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4255 init, offset);
4256 tree init_addr = fold_build_pointer_plus (base, init_offset);
4257 tree init_ref = build_fold_indirect_ref (init_addr);
4259 if (dump_enabled_p ())
4261 dump_printf_loc (MSG_NOTE, vect_location,
4262 "analyze in outer loop: ");
4263 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4264 dump_printf (MSG_NOTE, "\n");
4267 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4268 init_ref, loop))
4269 /* dr_analyze_innermost already explained the failure. */
4270 return false;
4272 if (dump_enabled_p ())
4274 dump_printf_loc (MSG_NOTE, vect_location,
4275 "\touter base_address: ");
4276 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4277 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4278 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4279 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4280 STMT_VINFO_DR_OFFSET (stmt_info));
4281 dump_printf (MSG_NOTE,
4282 "\n\touter constant offset from base address: ");
4283 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4284 STMT_VINFO_DR_INIT (stmt_info));
4285 dump_printf (MSG_NOTE, "\n\touter step: ");
4286 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4287 STMT_VINFO_DR_STEP (stmt_info));
4288 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4289 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4290 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4291 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4292 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4293 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4294 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4295 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4299 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
4300 STMT_VINFO_DATA_REF (stmt_info) = dr;
4302 /* Set vectype for STMT. */
4303 scalar_type = TREE_TYPE (DR_REF (dr));
4304 STMT_VINFO_VECTYPE (stmt_info)
4305 = get_vectype_for_scalar_type (scalar_type);
4306 if (!STMT_VINFO_VECTYPE (stmt_info))
4308 if (dump_enabled_p ())
4310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4311 "not vectorized: no vectype for stmt: ");
4312 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4313 stmt_info->stmt, 0);
4314 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4315 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4316 scalar_type);
4317 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4320 if (is_a <bb_vec_info> (vinfo))
4322 /* No vector type is fine, the ref can still participate
4323 in dependence analysis, we just can't vectorize it. */
4324 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4325 continue;
4327 return false;
4329 else
4331 if (dump_enabled_p ())
4333 dump_printf_loc (MSG_NOTE, vect_location,
4334 "got vectype for stmt: ");
4335 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
4336 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4337 STMT_VINFO_VECTYPE (stmt_info));
4338 dump_printf (MSG_NOTE, "\n");
4342 /* Adjust the minimal vectorization factor according to the
4343 vector type. */
4344 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4345 *min_vf = upper_bound (*min_vf, vf);
4347 if (gatherscatter != SG_NONE)
4349 gather_scatter_info gs_info;
4350 if (!vect_check_gather_scatter (stmt_info,
4351 as_a <loop_vec_info> (vinfo),
4352 &gs_info)
4353 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4355 if (dump_enabled_p ())
4357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4358 (gatherscatter == GATHER) ?
4359 "not vectorized: not suitable for gather "
4360 "load " :
4361 "not vectorized: not suitable for scatter "
4362 "store ");
4363 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4364 stmt_info->stmt, 0);
4366 return false;
4368 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4372 /* We used to stop processing and prune the list here. Verify we no
4373 longer need to. */
4374 gcc_assert (i == datarefs.length ());
4376 return true;
4380 /* Function vect_get_new_vect_var.
4382 Returns a name for a new variable. The current naming scheme prepends the
4383 prefix "vect_" or "vectp_" (depending on the value of VAR_KIND) to the
4384 names of vectorizer generated variables, with NAME appended after the
4385 prefix if it is provided. */
4387 tree
4388 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4390 const char *prefix;
4391 tree new_vect_var;
4393 switch (var_kind)
4395 case vect_simple_var:
4396 prefix = "vect";
4397 break;
4398 case vect_scalar_var:
4399 prefix = "stmp";
4400 break;
4401 case vect_mask_var:
4402 prefix = "mask";
4403 break;
4404 case vect_pointer_var:
4405 prefix = "vectp";
4406 break;
4407 default:
4408 gcc_unreachable ();
4411 if (name)
4413 char* tmp = concat (prefix, "_", name, NULL);
4414 new_vect_var = create_tmp_reg (type, tmp);
4415 free (tmp);
4417 else
4418 new_vect_var = create_tmp_reg (type, prefix);
4420 return new_vect_var;
4423 /* Like vect_get_new_vect_var but return an SSA name. */
4425 tree
4426 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4428 const char *prefix;
4429 tree new_vect_var;
4431 switch (var_kind)
4433 case vect_simple_var:
4434 prefix = "vect";
4435 break;
4436 case vect_scalar_var:
4437 prefix = "stmp";
4438 break;
4439 case vect_pointer_var:
4440 prefix = "vectp";
4441 break;
4442 default:
4443 gcc_unreachable ();
4446 if (name)
4448 char* tmp = concat (prefix, "_", name, NULL);
4449 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4450 free (tmp);
4452 else
4453 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4455 return new_vect_var;
4458 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4460 static void
4461 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4463 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4464 int misalign = DR_MISALIGNMENT (dr);
4465 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4466 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4467 else
4468 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4469 DR_TARGET_ALIGNMENT (dr), misalign);
4472 /* Function vect_create_addr_base_for_vector_ref.
4474 Create an expression that computes the address of the first memory location
4475 that will be accessed for a data reference.
4477 Input:
4478 STMT: The statement containing the data reference.
4479 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4480 OFFSET: Optional. If supplied, it is added to the initial address.
4481 LOOP: Specify relative to which loop-nest the address should be computed.
4482 For example, when the dataref is in an inner-loop nested in an
4483 outer-loop that is now being vectorized, LOOP can be either the
4484 outer-loop, or the inner-loop. The first memory location accessed
4485 by the following dataref ('in' points to short):
4487 for (i=0; i<N; i++)
4488 for (j=0; j<M; j++)
4489 s += in[i+j]
4491 is as follows:
4492 if LOOP=i_loop: &in (relative to i_loop)
4493 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4494 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4495 initial address. Unlike OFFSET, which is the number of elements to
4496 be added, BYTE_OFFSET is measured in bytes.
4498 Output:
4499 1. Return an SSA_NAME whose value is the address of the memory location of
4500 the first vector of the data reference.
4501 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4502 these statement(s) which define the returned SSA_NAME.
4504 FORNOW: We are only handling array accesses with step 1. */
4506 tree
4507 vect_create_addr_base_for_vector_ref (gimple *stmt,
4508 gimple_seq *new_stmt_list,
4509 tree offset,
4510 tree byte_offset)
4512 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4513 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4514 const char *base_name;
4515 tree addr_base;
4516 tree dest;
4517 gimple_seq seq = NULL;
4518 tree vect_ptr_type;
4519 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4520 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4521 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4523 tree data_ref_base = unshare_expr (drb->base_address);
4524 tree base_offset = unshare_expr (drb->offset);
4525 tree init = unshare_expr (drb->init);
4527 if (loop_vinfo)
4528 base_name = get_name (data_ref_base);
4529 else
4531 base_offset = ssize_int (0);
4532 init = ssize_int (0);
4533 base_name = get_name (DR_REF (dr));
4536 /* Create base_offset */
4537 base_offset = size_binop (PLUS_EXPR,
4538 fold_convert (sizetype, base_offset),
4539 fold_convert (sizetype, init));
4541 if (offset)
4543 offset = fold_build2 (MULT_EXPR, sizetype,
4544 fold_convert (sizetype, offset), step);
4545 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4546 base_offset, offset);
4548 if (byte_offset)
4550 byte_offset = fold_convert (sizetype, byte_offset);
4551 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4552 base_offset, byte_offset);
4555 /* base + base_offset */
4556 if (loop_vinfo)
4557 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4558 else
4560 addr_base = build1 (ADDR_EXPR,
4561 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4562 unshare_expr (DR_REF (dr)));
4565 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4566 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4567 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4568 gimple_seq_add_seq (new_stmt_list, seq);
4570 if (DR_PTR_INFO (dr)
4571 && TREE_CODE (addr_base) == SSA_NAME
4572 && !SSA_NAME_PTR_INFO (addr_base))
4574 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4575 if (offset || byte_offset)
4576 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4579 if (dump_enabled_p ())
4581 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4582 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4583 dump_printf (MSG_NOTE, "\n");
4586 return addr_base;
4590 /* Function vect_create_data_ref_ptr.
4592 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4593 location accessed in the loop by STMT, along with the def-use update
4594 chain to appropriately advance the pointer through the loop iterations.
4595 Also set aliasing information for the pointer. This pointer is used by
4596 the callers to this function to create a memory reference expression for
4597 vector load/store access.
4599 Input:
4600 1. STMT: a stmt that references memory. Expected to be of the form
4601 GIMPLE_ASSIGN <name, data-ref> or
4602 GIMPLE_ASSIGN <data-ref, name>.
4603 2. AGGR_TYPE: the type of the reference, which should be either a vector
4604 or an array.
4605 3. AT_LOOP: the loop where the vector memref is to be created.
4606 4. OFFSET (optional): an offset to be added to the initial address accessed
4607 by the data-ref in STMT.
4608 5. BSI: location where the new stmts are to be placed if there is no loop
4609 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4610 pointing to the initial address.
4611 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4612 to the initial address accessed by the data-ref in STMT. This is
4613 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4614 in bytes.
4615 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4616 to the IV during each iteration of the loop. NULL says to move
4617 by one copy of AGGR_TYPE up or down, depending on the step of the
4618 data reference.
4620 Output:
4621 1. Declare a new ptr to vector_type, and have it point to the base of the
4622 data reference (initial address accessed by the data reference).
4623 For example, for vector of type V8HI, the following code is generated:
4625 v8hi *ap;
4626 ap = (v8hi *)initial_address;
4628 if OFFSET is not supplied:
4629 initial_address = &a[init];
4630 if OFFSET is supplied:
4631 initial_address = &a[init + OFFSET];
4632 if BYTE_OFFSET is supplied:
4633 initial_address = &a[init] + BYTE_OFFSET;
4635 Return the initial_address in INITIAL_ADDRESS.
4637 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4638 update the pointer in each iteration of the loop.
4640 Return the increment stmt that updates the pointer in PTR_INCR.
4642 3. Set INV_P to true if the access pattern of the data reference in the
4643 vectorized loop is invariant. Set it to false otherwise.
4645 4. Return the pointer. */
4647 tree
4648 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4649 tree offset, tree *initial_address,
4650 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4651 bool only_init, bool *inv_p, tree byte_offset,
4652 tree iv_step)
4654 const char *base_name;
4655 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4656 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4657 struct loop *loop = NULL;
4658 bool nested_in_vect_loop = false;
4659 struct loop *containing_loop = NULL;
4660 tree aggr_ptr_type;
4661 tree aggr_ptr;
4662 tree new_temp;
4663 gimple_seq new_stmt_list = NULL;
4664 edge pe = NULL;
4665 basic_block new_bb;
4666 tree aggr_ptr_init;
4667 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4668 tree aptr;
4669 gimple_stmt_iterator incr_gsi;
4670 bool insert_after;
4671 tree indx_before_incr, indx_after_incr;
4672 gimple *incr;
4673 tree step;
4674 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4676 gcc_assert (iv_step != NULL_TREE
4677 || TREE_CODE (aggr_type) == ARRAY_TYPE
4678 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4680 if (loop_vinfo)
4682 loop = LOOP_VINFO_LOOP (loop_vinfo);
4683 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4684 containing_loop = (gimple_bb (stmt))->loop_father;
4685 pe = loop_preheader_edge (loop);
4687 else
4689 gcc_assert (bb_vinfo);
4690 only_init = true;
4691 *ptr_incr = NULL;
4694 /* Check the step (evolution) of the load in LOOP, and record
4695 whether it's invariant. */
4696 step = vect_dr_behavior (dr)->step;
4697 if (integer_zerop (step))
4698 *inv_p = true;
4699 else
4700 *inv_p = false;
4702 /* Create an expression for the first address accessed by this load
4703 in LOOP. */
4704 base_name = get_name (DR_BASE_ADDRESS (dr));
4706 if (dump_enabled_p ())
4708 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4709 dump_printf_loc (MSG_NOTE, vect_location,
4710 "create %s-pointer variable to type: ",
4711 get_tree_code_name (TREE_CODE (aggr_type)));
4712 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4713 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4714 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4715 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4716 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4717 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4718 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4719 else
4720 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4721 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4722 dump_printf (MSG_NOTE, "\n");
4725 /* (1) Create the new aggregate-pointer variable.
4726 Vector and array types inherit the alias set of their component
4727 type by default so we need to use a ref-all pointer if the data
4728 reference does not conflict with the created aggregated data
4729 reference because it is not addressable. */
4730 bool need_ref_all = false;
4731 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4732 get_alias_set (DR_REF (dr))))
4733 need_ref_all = true;
4734 /* Likewise for any of the data references in the stmt group. */
4735 else if (DR_GROUP_SIZE (stmt_info) > 1)
4737 gimple *orig_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info);
4740 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4741 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4742 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4743 get_alias_set (DR_REF (sdr))))
4745 need_ref_all = true;
4746 break;
4748 orig_stmt = DR_GROUP_NEXT_ELEMENT (sinfo);
4750 while (orig_stmt);
4752 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4753 need_ref_all);
4754 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4757 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4758 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4759 def-use update cycles for the pointer: one relative to the outer-loop
4760 (LOOP), which is what steps (3) and (4) below do. The other is relative
4761 to the inner-loop (which is the inner-most loop containing the dataref),
4762 and this is done by step (5) below.
4764 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4765 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4766 redundant. Steps (3),(4) create the following:
4768 vp0 = &base_addr;
4769 LOOP: vp1 = phi(vp0,vp2)
4772 vp2 = vp1 + step
4773 goto LOOP
4775 If there is an inner-loop nested in loop, then step (5) will also be
4776 applied, and an additional update in the inner-loop will be created:
4778 vp0 = &base_addr;
4779 LOOP: vp1 = phi(vp0,vp2)
4781 inner: vp3 = phi(vp1,vp4)
4782 vp4 = vp3 + inner_step
4783 if () goto inner
4785 vp2 = vp1 + step
4786 if () goto LOOP */
4788 /* (2) Calculate the initial address of the aggregate-pointer, and set
4789 the aggregate-pointer to point to it before the loop. */
4791 /* Create &(base[init_val + offset]) + byte_offset in the loop preheader. */
4793 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4794 offset, byte_offset);
4795 if (new_stmt_list)
4797 if (pe)
4799 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4800 gcc_assert (!new_bb);
4802 else
4803 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4806 *initial_address = new_temp;
4807 aggr_ptr_init = new_temp;
4809 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4810 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4811 inner-loop nested in LOOP (during outer-loop vectorization). */
4813 /* No update in loop is required. */
4814 if (only_init && (!loop_vinfo || at_loop == loop))
4815 aptr = aggr_ptr_init;
4816 else
4818 if (iv_step == NULL_TREE)
4820 /* The step of the aggregate pointer is the type size. */
4821 iv_step = TYPE_SIZE_UNIT (aggr_type);
4822 /* One exception to the above is when the scalar step of the load in
4823 LOOP is zero. In this case the step here is also zero. */
4824 if (*inv_p)
4825 iv_step = size_zero_node;
4826 else if (tree_int_cst_sgn (step) == -1)
4827 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4830 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4832 create_iv (aggr_ptr_init,
4833 fold_convert (aggr_ptr_type, iv_step),
4834 aggr_ptr, loop, &incr_gsi, insert_after,
4835 &indx_before_incr, &indx_after_incr);
4836 incr = gsi_stmt (incr_gsi);
4837 loop_vinfo->add_stmt (incr);
4839 /* Copy the points-to information if it exists. */
4840 if (DR_PTR_INFO (dr))
4842 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4843 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4845 if (ptr_incr)
4846 *ptr_incr = incr;
4848 aptr = indx_before_incr;
4851 if (!nested_in_vect_loop || only_init)
4852 return aptr;
4855 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4856 nested in LOOP, if it exists. */
4858 gcc_assert (nested_in_vect_loop);
4859 if (!only_init)
4861 standard_iv_increment_position (containing_loop, &incr_gsi,
4862 &insert_after);
4863 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4864 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4865 &indx_after_incr);
4866 incr = gsi_stmt (incr_gsi);
4867 loop_vinfo->add_stmt (incr);
4869 /* Copy the points-to information if it exists. */
4870 if (DR_PTR_INFO (dr))
4872 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4873 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4875 if (ptr_incr)
4876 *ptr_incr = incr;
4878 return indx_before_incr;
4880 else
4881 gcc_unreachable ();
4885 /* Function bump_vector_ptr
4887 Increment a pointer (to a vector type) by vector-size. If requested,
4888 i.e. if PTR-INCR is given, then also connect the new increment stmt
4889 to the existing def-use update-chain of the pointer, by modifying
4890 the PTR_INCR as illustrated below:
4892 The pointer def-use update-chain before this function:
4893 DATAREF_PTR = phi (p_0, p_2)
4894 ....
4895 PTR_INCR: p_2 = DATAREF_PTR + step
4897 The pointer def-use update-chain after this function:
4898 DATAREF_PTR = phi (p_0, p_2)
4899 ....
4900 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4901 ....
4902 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4904 Input:
4905 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4906 in the loop.
4907 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4908 the loop. The increment amount across iterations is expected
4909 to be vector_size.
4910 BSI - location where the new update stmt is to be placed.
4911 STMT - the original scalar memory-access stmt that is being vectorized.
4912 BUMP - optional. The offset by which to bump the pointer. If not given,
4913 the offset is assumed to be vector_size.
4915 Output: Return NEW_DATAREF_PTR as illustrated above.
4919 tree
4920 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4921 gimple *stmt, tree bump)
4923 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4924 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4925 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4926 tree update = TYPE_SIZE_UNIT (vectype);
4927 gassign *incr_stmt;
4928 ssa_op_iter iter;
4929 use_operand_p use_p;
4930 tree new_dataref_ptr;
4932 if (bump)
4933 update = bump;
4935 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4936 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4937 else
4938 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4939 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4940 dataref_ptr, update);
4941 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4943 /* Copy the points-to information if it exists. */
4944 if (DR_PTR_INFO (dr))
4946 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4947 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4950 if (!ptr_incr)
4951 return new_dataref_ptr;
4953 /* Update the vector-pointer's cross-iteration increment. */
4954 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4956 tree use = USE_FROM_PTR (use_p);
4958 if (use == dataref_ptr)
4959 SET_USE (use_p, new_dataref_ptr);
4960 else
4961 gcc_assert (operand_equal_p (use, update, 0));
4964 return new_dataref_ptr;
4968 /* Copy memory reference info such as base/clique from the SRC reference
4969 to the DEST MEM_REF. */
4971 void
4972 vect_copy_ref_info (tree dest, tree src)
4974 if (TREE_CODE (dest) != MEM_REF)
4975 return;
4977 tree src_base = src;
4978 while (handled_component_p (src_base))
4979 src_base = TREE_OPERAND (src_base, 0);
4980 if (TREE_CODE (src_base) != MEM_REF
4981 && TREE_CODE (src_base) != TARGET_MEM_REF)
4982 return;
4984 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
4985 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
4989 /* Function vect_create_destination_var.
4991 Create a new temporary of type VECTYPE. */
4993 tree
4994 vect_create_destination_var (tree scalar_dest, tree vectype)
4996 tree vec_dest;
4997 const char *name;
4998 char *new_name;
4999 tree type;
5000 enum vect_var_kind kind;
5002 kind = vectype
5003 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5004 ? vect_mask_var
5005 : vect_simple_var
5006 : vect_scalar_var;
5007 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5009 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5011 name = get_name (scalar_dest);
5012 if (name)
5013 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5014 else
5015 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5016 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5017 free (new_name);
5019 return vec_dest;
5022 /* Function vect_grouped_store_supported.
5024 Returns TRUE if interleave high and interleave low permutations
5025 are supported, and FALSE otherwise. */
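/* As an illustration, for V8HI vectors the power-of-two case below checks
   the interleave-high selector {0, 8, 1, 9, 2, 10, 3, 11} and the
   interleave-low selector {4, 12, 5, 13, 6, 14, 7, 15}.  */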
5027 bool
5028 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5030 machine_mode mode = TYPE_MODE (vectype);
5032 /* vect_permute_store_chain requires the group size to be equal to 3 or
5033 be a power of two. */
5034 if (count != 3 && exact_log2 (count) == -1)
5036 if (dump_enabled_p ())
5037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5038 "the size of the group of accesses"
5039 " is not a power of 2 or not eqaul to 3\n");
5040 return false;
5043 /* Check that the permutation is supported. */
5044 if (VECTOR_MODE_P (mode))
5046 unsigned int i;
5047 if (count == 3)
5049 unsigned int j0 = 0, j1 = 0, j2 = 0;
5050 unsigned int i, j;
5052 unsigned int nelt;
5053 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5055 if (dump_enabled_p ())
5056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5057 "cannot handle groups of 3 stores for"
5058 " variable-length vectors\n");
5059 return false;
5062 vec_perm_builder sel (nelt, nelt, 1);
5063 sel.quick_grow (nelt);
5064 vec_perm_indices indices;
5065 for (j = 0; j < 3; j++)
5067 int nelt0 = ((3 - j) * nelt) % 3;
5068 int nelt1 = ((3 - j) * nelt + 1) % 3;
5069 int nelt2 = ((3 - j) * nelt + 2) % 3;
5070 for (i = 0; i < nelt; i++)
5072 if (3 * i + nelt0 < nelt)
5073 sel[3 * i + nelt0] = j0++;
5074 if (3 * i + nelt1 < nelt)
5075 sel[3 * i + nelt1] = nelt + j1++;
5076 if (3 * i + nelt2 < nelt)
5077 sel[3 * i + nelt2] = 0;
5079 indices.new_vector (sel, 2, nelt);
5080 if (!can_vec_perm_const_p (mode, indices))
5082 if (dump_enabled_p ())
5083 dump_printf (MSG_MISSED_OPTIMIZATION,
5084 "permutation op not supported by target.\n");
5085 return false;
5088 for (i = 0; i < nelt; i++)
5090 if (3 * i + nelt0 < nelt)
5091 sel[3 * i + nelt0] = 3 * i + nelt0;
5092 if (3 * i + nelt1 < nelt)
5093 sel[3 * i + nelt1] = 3 * i + nelt1;
5094 if (3 * i + nelt2 < nelt)
5095 sel[3 * i + nelt2] = nelt + j2++;
5097 indices.new_vector (sel, 2, nelt);
5098 if (!can_vec_perm_const_p (mode, indices))
5100 if (dump_enabled_p ())
5101 dump_printf (MSG_MISSED_OPTIMIZATION,
5102 "permutation op not supported by target.\n");
5103 return false;
5106 return true;
5108 else
5110 /* If length is not equal to 3 then only a power of 2 is supported. */
5111 gcc_assert (pow2p_hwi (count));
5112 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5114 /* The encoding has 2 interleaved stepped patterns. */
5115 vec_perm_builder sel (nelt, 2, 3);
5116 sel.quick_grow (6);
5117 for (i = 0; i < 3; i++)
5119 sel[i * 2] = i;
5120 sel[i * 2 + 1] = i + nelt;
5122 vec_perm_indices indices (sel, 2, nelt);
5123 if (can_vec_perm_const_p (mode, indices))
5125 for (i = 0; i < 6; i++)
5126 sel[i] += exact_div (nelt, 2);
5127 indices.new_vector (sel, 2, nelt);
5128 if (can_vec_perm_const_p (mode, indices))
5129 return true;
5134 if (dump_enabled_p ())
5135 dump_printf (MSG_MISSED_OPTIMIZATION,
5136 "permutaion op not supported by target.\n");
5137 return false;
5141 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5142 type VECTYPE. MASKED_P says whether the masked form is needed. */
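/* On targets with multi-register structure stores the vec_store_lanes
   optabs are expected to map onto those instructions (e.g. AArch64
   ST2/ST3/ST4); this is only an illustrative note, not a requirement.  */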
5144 bool
5145 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5146 bool masked_p)
5148 if (masked_p)
5149 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5150 vec_mask_store_lanes_optab,
5151 vectype, count);
5152 else
5153 return vect_lanes_optab_supported_p ("vec_store_lanes",
5154 vec_store_lanes_optab,
5155 vectype, count);
5159 /* Function vect_permute_store_chain.
5161 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5162 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5163 the data correctly for the stores. Return the final references for stores
5164 in RESULT_CHAIN.
5166 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5167 The input is 4 vectors each containing 8 elements. We assign a number to
5168 each element, the input sequence is:
5170 1st vec: 0 1 2 3 4 5 6 7
5171 2nd vec: 8 9 10 11 12 13 14 15
5172 3rd vec: 16 17 18 19 20 21 22 23
5173 4th vec: 24 25 26 27 28 29 30 31
5175 The output sequence should be:
5177 1st vec: 0 8 16 24 1 9 17 25
5178 2nd vec: 2 10 18 26 3 11 19 27
5179 3rd vec: 4 12 20 28 5 13 21 29
5180 4th vec: 6 14 22 30 7 15 23 31
5182 i.e., we interleave the contents of the four vectors in their order.
5184 We use interleave_high/low instructions to create such output. The input of
5185 each interleave_high/low operation is two vectors:
5186 1st vec 2nd vec
5187 0 1 2 3 4 5 6 7
5188 the even elements of the result vector are obtained left-to-right from the
5189 high/low elements of the first vector. The odd elements of the result are
5190 obtained left-to-right from the high/low elements of the second vector.
5191 The output of interleave_high will be: 0 4 1 5
5192 and of interleave_low: 2 6 3 7
5195 The permutation is done in log LENGTH stages. In each stage interleave_high
5196 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5197 where the first argument is taken from the first half of DR_CHAIN and the
5198 second argument from its second half.
5199 In our example,
5201 I1: interleave_high (1st vec, 3rd vec)
5202 I2: interleave_low (1st vec, 3rd vec)
5203 I3: interleave_high (2nd vec, 4th vec)
5204 I4: interleave_low (2nd vec, 4th vec)
5206 The output for the first stage is:
5208 I1: 0 16 1 17 2 18 3 19
5209 I2: 4 20 5 21 6 22 7 23
5210 I3: 8 24 9 25 10 26 11 27
5211 I4: 12 28 13 29 14 30 15 31
5213 The output of the second stage, i.e. the final result is:
5215 I1: 0 8 16 24 1 9 17 25
5216 I2: 2 10 18 26 3 11 19 27
5217 I3: 4 12 20 28 5 13 21 29
5218 I4: 6 14 22 30 7 15 23 31. */
5220 void
5221 vect_permute_store_chain (vec<tree> dr_chain,
5222 unsigned int length,
5223 gimple *stmt,
5224 gimple_stmt_iterator *gsi,
5225 vec<tree> *result_chain)
5227 tree vect1, vect2, high, low;
5228 gimple *perm_stmt;
5229 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5230 tree perm_mask_low, perm_mask_high;
5231 tree data_ref;
5232 tree perm3_mask_low, perm3_mask_high;
5233 unsigned int i, j, n, log_length = exact_log2 (length);
5235 result_chain->quick_grow (length);
5236 memcpy (result_chain->address (), dr_chain.address (),
5237 length * sizeof (tree));
5239 if (length == 3)
5241 /* vect_grouped_store_supported ensures that this is constant. */
5242 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5243 unsigned int j0 = 0, j1 = 0, j2 = 0;
5245 vec_perm_builder sel (nelt, nelt, 1);
5246 sel.quick_grow (nelt);
5247 vec_perm_indices indices;
5248 for (j = 0; j < 3; j++)
5250 int nelt0 = ((3 - j) * nelt) % 3;
5251 int nelt1 = ((3 - j) * nelt + 1) % 3;
5252 int nelt2 = ((3 - j) * nelt + 2) % 3;
5254 for (i = 0; i < nelt; i++)
5256 if (3 * i + nelt0 < nelt)
5257 sel[3 * i + nelt0] = j0++;
5258 if (3 * i + nelt1 < nelt)
5259 sel[3 * i + nelt1] = nelt + j1++;
5260 if (3 * i + nelt2 < nelt)
5261 sel[3 * i + nelt2] = 0;
5263 indices.new_vector (sel, 2, nelt);
5264 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5266 for (i = 0; i < nelt; i++)
5268 if (3 * i + nelt0 < nelt)
5269 sel[3 * i + nelt0] = 3 * i + nelt0;
5270 if (3 * i + nelt1 < nelt)
5271 sel[3 * i + nelt1] = 3 * i + nelt1;
5272 if (3 * i + nelt2 < nelt)
5273 sel[3 * i + nelt2] = nelt + j2++;
5275 indices.new_vector (sel, 2, nelt);
5276 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5278 vect1 = dr_chain[0];
5279 vect2 = dr_chain[1];
5281 /* Create interleaving stmt:
5282 low = VEC_PERM_EXPR <vect1, vect2,
5283 {j, nelt, *, j + 1, nelt + j + 1, *,
5284 j + 2, nelt + j + 2, *, ...}> */
5285 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5286 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5287 vect2, perm3_mask_low);
5288 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5290 vect1 = data_ref;
5291 vect2 = dr_chain[2];
5292 /* Create interleaving stmt:
5293 low = VEC_PERM_EXPR <vect1, vect2,
5294 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5295 6, 7, nelt + j + 2, ...}> */
5296 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5297 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5298 vect2, perm3_mask_high);
5299 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5300 (*result_chain)[j] = data_ref;
5303 else
5305 /* If length is not equal to 3 then only a power of 2 is supported. */
5306 gcc_assert (pow2p_hwi (length));
5308 /* The encoding has 2 interleaved stepped patterns. */
5309 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5310 vec_perm_builder sel (nelt, 2, 3);
5311 sel.quick_grow (6);
5312 for (i = 0; i < 3; i++)
5314 sel[i * 2] = i;
5315 sel[i * 2 + 1] = i + nelt;
5317 vec_perm_indices indices (sel, 2, nelt);
5318 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5320 for (i = 0; i < 6; i++)
5321 sel[i] += exact_div (nelt, 2);
5322 indices.new_vector (sel, 2, nelt);
5323 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5325 for (i = 0, n = log_length; i < n; i++)
5327 for (j = 0; j < length/2; j++)
5329 vect1 = dr_chain[j];
5330 vect2 = dr_chain[j+length/2];
5332 /* Create interleaving stmt:
5333 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5334 ...}> */
5335 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5336 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5337 vect2, perm_mask_high);
5338 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5339 (*result_chain)[2*j] = high;
5341 /* Create interleaving stmt:
5342 low = VEC_PERM_EXPR <vect1, vect2,
5343 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5344 ...}> */
5345 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5346 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5347 vect2, perm_mask_low);
5348 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5349 (*result_chain)[2*j+1] = low;
5351 memcpy (dr_chain.address (), result_chain->address (),
5352 length * sizeof (tree));
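/* Illustrative sketch, not part of GCC: a scalar model of the log2 (LENGTH)
   interleave stages implemented above, operating on plain element indices
   instead of SSA names.  CHAIN holds LENGTH vectors of NELT indices laid out
   contiguously, LENGTH is a power of two, and SCRATCH has the same size as
   CHAIN.  For LENGTH == 4 and NELT == 8 the final contents of CHAIN are the
   interleaved order documented in the comment (0 8 16 24 1 9 17 25, ...).
   The helper name is hypothetical.  */

static void
model_store_interleave (unsigned int *chain, unsigned int length,
                        unsigned int nelt, unsigned int *scratch)
{
  for (unsigned int stage = 1; stage < length; stage *= 2)
    {
      for (unsigned int j = 0; j < length / 2; j++)
        {
          const unsigned int *v1 = chain + j * nelt;
          const unsigned int *v2 = chain + (j + length / 2) * nelt;
          unsigned int *high = scratch + (2 * j) * nelt;
          unsigned int *low = scratch + (2 * j + 1) * nelt;
          for (unsigned int i = 0; i < nelt / 2; i++)
            {
              /* perm_mask_high: {0, nelt, 1, nelt + 1, ...}.  */
              high[2 * i] = v1[i];
              high[2 * i + 1] = v2[i];
              /* perm_mask_low: {nelt/2, nelt*3/2, nelt/2 + 1, ...}.  */
              low[2 * i] = v1[nelt / 2 + i];
              low[2 * i + 1] = v2[nelt / 2 + i];
            }
        }
      /* Mirror the memcpy back into DR_CHAIN at the end of each stage.  */
      for (unsigned int i = 0; i < length * nelt; i++)
        chain[i] = scratch[i];
    }
}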
5357 /* Function vect_setup_realignment
5359 This function is called when vectorizing an unaligned load using
5360 the dr_explicit_realign[_optimized] scheme.
5361 This function generates the following code at the loop prolog:
5363 p = initial_addr;
5364 x msq_init = *(floor(p)); # prolog load
5365 realignment_token = call target_builtin;
5366 loop:
5367 x msq = phi (msq_init, ---)
5369 The stmts marked with x are generated only for the case of
5370 dr_explicit_realign_optimized.
5372 The code above sets up a new (vector) pointer, pointing to the first
5373 location accessed by STMT, and a "floor-aligned" load using that pointer.
5374 It also generates code to compute the "realignment-token" (if the relevant
5375 target hook was defined), and creates a phi-node at the loop-header bb
5376 whose arguments are the result of the prolog-load (created by this
5377 function) and the result of a load that takes place in the loop (to be
5378 created by the caller to this function).
5380 For the case of dr_explicit_realign_optimized:
5381 The caller to this function uses the phi-result (msq) to create the
5382 realignment code inside the loop, and sets up the missing phi argument,
5383 as follows:
5384 loop:
5385 msq = phi (msq_init, lsq)
5386 lsq = *(floor(p')); # load in loop
5387 result = realign_load (msq, lsq, realignment_token);
5389 For the case of dr_explicit_realign:
5390 loop:
5391 msq = *(floor(p)); # load in loop
5392 p' = p + (VS-1);
5393 lsq = *(floor(p')); # load in loop
5394 result = realign_load (msq, lsq, realignment_token);
5396 Input:
5397 STMT - (scalar) load stmt to be vectorized. This load accesses
5398 a memory location that may be unaligned.
5399 BSI - place where new code is to be inserted.
5400 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5401 is used.
5403 Output:
5404 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5405 target hook, if defined.
5406 Return value - the result of the loop-header phi node. */
5408 tree
5409 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5410 tree *realignment_token,
5411 enum dr_alignment_support alignment_support_scheme,
5412 tree init_addr,
5413 struct loop **at_loop)
5415 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5416 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5417 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5418 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5419 struct loop *loop = NULL;
5420 edge pe = NULL;
5421 tree scalar_dest = gimple_assign_lhs (stmt);
5422 tree vec_dest;
5423 gimple *inc;
5424 tree ptr;
5425 tree data_ref;
5426 basic_block new_bb;
5427 tree msq_init = NULL_TREE;
5428 tree new_temp;
5429 gphi *phi_stmt;
5430 tree msq = NULL_TREE;
5431 gimple_seq stmts = NULL;
5432 bool inv_p;
5433 bool compute_in_loop = false;
5434 bool nested_in_vect_loop = false;
5435 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5436 struct loop *loop_for_initial_load = NULL;
5438 if (loop_vinfo)
5440 loop = LOOP_VINFO_LOOP (loop_vinfo);
5441 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5444 gcc_assert (alignment_support_scheme == dr_explicit_realign
5445 || alignment_support_scheme == dr_explicit_realign_optimized);
5447 /* We need to generate three things:
5448 1. the misalignment computation
5449 2. the extra vector load (for the optimized realignment scheme).
5450 3. the phi node for the two vectors from which the realignment is
5451 done (for the optimized realignment scheme). */
5453 /* 1. Determine where to generate the misalignment computation.
5455 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5456 calculation will be generated by this function, outside the loop (in the
5457 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5458 caller, inside the loop.
5460 Background: If the misalignment remains fixed throughout the iterations of
5461 the loop, then both realignment schemes are applicable, and also the
5462 misalignment computation can be done outside LOOP. This is because we are
5463 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5464 are a multiple of VS (the Vector Size), and therefore the misalignment in
5465 different vectorized LOOP iterations is always the same.
5466 The problem arises only if the memory access is in an inner-loop nested
5467 inside LOOP, which is now being vectorized using outer-loop vectorization.
5468 This is the only case when the misalignment of the memory access may not
5469 remain fixed throughout the iterations of the inner-loop (as explained in
5470 detail in vect_supportable_dr_alignment). In this case, not only is the
5471 optimized realignment scheme not applicable, but also the misalignment
5472 computation (and generation of the realignment token that is passed to
5473 REALIGN_LOAD) have to be done inside the loop.
5475 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5476 or not, which in turn determines if the misalignment is computed inside
5477 the inner-loop, or outside LOOP. */
5479 if (init_addr != NULL_TREE || !loop_vinfo)
5481 compute_in_loop = true;
5482 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5486 /* 2. Determine where to generate the extra vector load.
5488 For the optimized realignment scheme, instead of generating two vector
5489 loads in each iteration, we generate a single extra vector load in the
5490 preheader of the loop, and in each iteration reuse the result of the
5491 vector load from the previous iteration. In case the memory access is in
5492 an inner-loop nested inside LOOP, which is now being vectorized using
5493 outer-loop vectorization, we need to determine whether this initial vector
5494 load should be generated at the preheader of the inner-loop, or can be
5495 generated at the preheader of LOOP. If the memory access has no evolution
5496 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5497 to be generated inside LOOP (in the preheader of the inner-loop). */
5499 if (nested_in_vect_loop)
5501 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5502 bool invariant_in_outerloop =
5503 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5504 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5506 else
5507 loop_for_initial_load = loop;
5508 if (at_loop)
5509 *at_loop = loop_for_initial_load;
5511 if (loop_for_initial_load)
5512 pe = loop_preheader_edge (loop_for_initial_load);
5514 /* 3. For the case of the optimized realignment, create the first vector
5515 load at the loop preheader. */
5517 if (alignment_support_scheme == dr_explicit_realign_optimized)
5519 /* Create msq_init = *(floor(p1)) in the loop preheader */
5520 gassign *new_stmt;
5522 gcc_assert (!compute_in_loop);
5523 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5524 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5525 NULL_TREE, &init_addr, NULL, &inc,
5526 true, &inv_p);
5527 if (TREE_CODE (ptr) == SSA_NAME)
5528 new_temp = copy_ssa_name (ptr);
5529 else
5530 new_temp = make_ssa_name (TREE_TYPE (ptr));
5531 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5532 new_stmt = gimple_build_assign
5533 (new_temp, BIT_AND_EXPR, ptr,
5534 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5535 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5536 gcc_assert (!new_bb);
5537 data_ref
5538 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5539 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5540 vect_copy_ref_info (data_ref, DR_REF (dr));
5541 new_stmt = gimple_build_assign (vec_dest, data_ref);
5542 new_temp = make_ssa_name (vec_dest, new_stmt);
5543 gimple_assign_set_lhs (new_stmt, new_temp);
5544 if (pe)
5546 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5547 gcc_assert (!new_bb);
5549 else
5550 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5552 msq_init = gimple_assign_lhs (new_stmt);
5555 /* 4. Create realignment token using a target builtin, if available.
5556 It is done either inside the containing loop, or before LOOP (as
5557 determined above). */
5559 if (targetm.vectorize.builtin_mask_for_load)
5561 gcall *new_stmt;
5562 tree builtin_decl;
5564 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5565 if (!init_addr)
5567 /* Generate the INIT_ADDR computation outside LOOP. */
5568 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5569 NULL_TREE);
5570 if (loop)
5572 pe = loop_preheader_edge (loop);
5573 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5574 gcc_assert (!new_bb);
5576 else
5577 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5580 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5581 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5582 vec_dest =
5583 vect_create_destination_var (scalar_dest,
5584 gimple_call_return_type (new_stmt));
5585 new_temp = make_ssa_name (vec_dest, new_stmt);
5586 gimple_call_set_lhs (new_stmt, new_temp);
5588 if (compute_in_loop)
5589 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5590 else
5592 /* Generate the misalignment computation outside LOOP. */
5593 pe = loop_preheader_edge (loop);
5594 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5595 gcc_assert (!new_bb);
5598 *realignment_token = gimple_call_lhs (new_stmt);
5600 /* The result of the CALL_EXPR to this builtin is determined from
5601 the value of the parameter and no global variables are touched,
5602 which makes the builtin a "const" function. Requiring the
5603 builtin to have the "const" attribute makes it unnecessary
5604 to call mark_call_clobbered. */
5605 gcc_assert (TREE_READONLY (builtin_decl));
5608 if (alignment_support_scheme == dr_explicit_realign)
5609 return msq;
5611 gcc_assert (!compute_in_loop);
5612 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5615 /* 5. Create msq = phi <msq_init, lsq> in loop */
5617 pe = loop_preheader_edge (containing_loop);
5618 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5619 msq = make_ssa_name (vec_dest);
5620 phi_stmt = create_phi_node (msq, containing_loop->header);
5621 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5623 return msq;
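/* Illustrative sketch, not part of GCC: a byte-level model of the
   dr_explicit_realign scheme set up above.  MSQ and LSQ are the two
   floor-aligned loads that surround the unaligned address, and the
   realignment token is modelled here simply as the byte misalignment; real
   targets encode it as a permute mask returned by builtin_mask_for_load.
   The helper name is hypothetical.  */

static void
model_realign_load (const unsigned char *msq, const unsigned char *lsq,
                    unsigned int misalign, unsigned int vecsize,
                    unsigned char *result)
{
  /* result = (msq ++ lsq)[misalign .. misalign + vecsize - 1].  */
  for (unsigned int i = 0; i < vecsize; i++)
    result[i] = (misalign + i < vecsize
                 ? msq[misalign + i]
                 : lsq[misalign + i - vecsize]);
}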
5627 /* Function vect_grouped_load_supported.
5629 COUNT is the size of the load group (the number of statements plus the
5630 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5631 only one statement, with a gap of COUNT - 1.
5633 Returns true if a suitable permute exists. */
5635 bool
5636 vect_grouped_load_supported (tree vectype, bool single_element_p,
5637 unsigned HOST_WIDE_INT count)
5639 machine_mode mode = TYPE_MODE (vectype);
5641 /* If this is single-element interleaving with an element distance
5642 that leaves unused vector loads around, punt - we would at least create
5643 very sub-optimal code in that case (and blow up memory,
5644 see PR65518). */
5645 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5647 if (dump_enabled_p ())
5648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5649 "single-element interleaving not supported "
5650 "for not adjacent vector loads\n");
5651 return false;
5654 /* vect_permute_load_chain requires the group size to be equal to 3 or
5655 be a power of two. */
5656 if (count != 3 && exact_log2 (count) == -1)
5658 if (dump_enabled_p ())
5659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5660 "the size of the group of accesses"
5661 " is not a power of 2 or not equal to 3\n");
5662 return false;
5665 /* Check that the permutation is supported. */
5666 if (VECTOR_MODE_P (mode))
5668 unsigned int i, j;
5669 if (count == 3)
5671 unsigned int nelt;
5672 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5674 if (dump_enabled_p ())
5675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5676 "cannot handle groups of 3 loads for"
5677 " variable-length vectors\n");
5678 return false;
5681 vec_perm_builder sel (nelt, nelt, 1);
5682 sel.quick_grow (nelt);
5683 vec_perm_indices indices;
5684 unsigned int k;
5685 for (k = 0; k < 3; k++)
5687 for (i = 0; i < nelt; i++)
5688 if (3 * i + k < 2 * nelt)
5689 sel[i] = 3 * i + k;
5690 else
5691 sel[i] = 0;
5692 indices.new_vector (sel, 2, nelt);
5693 if (!can_vec_perm_const_p (mode, indices))
5695 if (dump_enabled_p ())
5696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5697 "shuffle of 3 loads is not supported by"
5698 " target\n");
5699 return false;
5701 for (i = 0, j = 0; i < nelt; i++)
5702 if (3 * i + k < 2 * nelt)
5703 sel[i] = i;
5704 else
5705 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5706 indices.new_vector (sel, 2, nelt);
5707 if (!can_vec_perm_const_p (mode, indices))
5709 if (dump_enabled_p ())
5710 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5711 "shuffle of 3 loads is not supported by"
5712 " target\n");
5713 return false;
5716 return true;
5718 else
5720 /* If length is not equal to 3 then only power of 2 is supported. */
5721 gcc_assert (pow2p_hwi (count));
5722 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5724 /* The encoding has a single stepped pattern. */
5725 vec_perm_builder sel (nelt, 1, 3);
5726 sel.quick_grow (3);
5727 for (i = 0; i < 3; i++)
5728 sel[i] = i * 2;
5729 vec_perm_indices indices (sel, 2, nelt);
5730 if (can_vec_perm_const_p (mode, indices))
5732 for (i = 0; i < 3; i++)
5733 sel[i] = i * 2 + 1;
5734 indices.new_vector (sel, 2, nelt);
5735 if (can_vec_perm_const_p (mode, indices))
5736 return true;
5741 if (dump_enabled_p ())
5742 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5743 "extract even/odd not supported by target\n");
5744 return false;
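/* Illustrative sketch, not part of GCC: the full permute selectors that the
   stepped encodings above (3 starting elements, step 2) expand to for a
   fixed NELT.  These are the two even/odd selections the power-of-two case
   asks the target about; elements are numbered 0 .. 2*NELT-1 across the
   concatenation of the two input vectors.  The helper name is
   hypothetical.  */

static void
model_even_odd_selectors (unsigned int nelt,
                          unsigned int *sel_even, unsigned int *sel_odd)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      sel_even[i] = 2 * i;      /* 0, 2, 4, ...  */
      sel_odd[i] = 2 * i + 1;   /* 1, 3, 5, ...  */
    }
}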
5747 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5748 type VECTYPE. MASKED_P says whether the masked form is needed. */
5750 bool
5751 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5752 bool masked_p)
5754 if (masked_p)
5755 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5756 vec_mask_load_lanes_optab,
5757 vectype, count);
5758 else
5759 return vect_lanes_optab_supported_p ("vec_load_lanes",
5760 vec_load_lanes_optab,
5761 vectype, count);
5764 /* Function vect_permute_load_chain.
5766 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5767 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5768 the input data correctly. Return the final references for loads in
5769 RESULT_CHAIN.
5771 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5772 The input is 4 vectors each containing 8 elements. We assign a number to each
5773 element, the input sequence is:
5775 1st vec: 0 1 2 3 4 5 6 7
5776 2nd vec: 8 9 10 11 12 13 14 15
5777 3rd vec: 16 17 18 19 20 21 22 23
5778 4th vec: 24 25 26 27 28 29 30 31
5780 The output sequence should be:
5782 1st vec: 0 4 8 12 16 20 24 28
5783 2nd vec: 1 5 9 13 17 21 25 29
5784 3rd vec: 2 6 10 14 18 22 26 30
5785 4th vec: 3 7 11 15 19 23 27 31
5787 i.e., the first output vector should contain the first elements of each
5788 interleaving group, etc.
5790 We use extract_even/odd instructions to create such output. The input of
5791 each extract_even/odd operation is two vectors
5792 1st vec 2nd vec
5793 0 1 2 3 4 5 6 7
5795 and the output is the vector of extracted even/odd elements. The output of
5796 extract_even will be: 0 2 4 6
5797 and of extract_odd: 1 3 5 7
5800 The permutation is done in log LENGTH stages. In each stage extract_even
5801 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5802 their order. In our example,
5804 E1: extract_even (1st vec, 2nd vec)
5805 E2: extract_odd (1st vec, 2nd vec)
5806 E3: extract_even (3rd vec, 4th vec)
5807 E4: extract_odd (3rd vec, 4th vec)
5809 The output for the first stage will be:
5811 E1: 0 2 4 6 8 10 12 14
5812 E2: 1 3 5 7 9 11 13 15
5813 E3: 16 18 20 22 24 26 28 30
5814 E4: 17 19 21 23 25 27 29 31
5816 In order to proceed and create the correct sequence for the next stage (or
5817 for the correct output, if the second stage is the last one, as in our
5818 example), we first put the output of extract_even operation and then the
5819 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5820 The input for the second stage is:
5822 1st vec (E1): 0 2 4 6 8 10 12 14
5823 2nd vec (E3): 16 18 20 22 24 26 28 30
5824 3rd vec (E2): 1 3 5 7 9 11 13 15
5825 4th vec (E4): 17 19 21 23 25 27 29 31
5827 The output of the second stage:
5829 E1: 0 4 8 12 16 20 24 28
5830 E2: 2 6 10 14 18 22 26 30
5831 E3: 1 5 9 13 17 21 25 29
5832 E4: 3 7 11 15 19 23 27 31
5834 And RESULT_CHAIN after reordering:
5836 1st vec (E1): 0 4 8 12 16 20 24 28
5837 2nd vec (E3): 1 5 9 13 17 21 25 29
5838 3rd vec (E2): 2 6 10 14 18 22 26 30
5839 4th vec (E4): 3 7 11 15 19 23 27 31. */
5841 static void
5842 vect_permute_load_chain (vec<tree> dr_chain,
5843 unsigned int length,
5844 gimple *stmt,
5845 gimple_stmt_iterator *gsi,
5846 vec<tree> *result_chain)
5848 tree data_ref, first_vect, second_vect;
5849 tree perm_mask_even, perm_mask_odd;
5850 tree perm3_mask_low, perm3_mask_high;
5851 gimple *perm_stmt;
5852 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5853 unsigned int i, j, log_length = exact_log2 (length);
5855 result_chain->quick_grow (length);
5856 memcpy (result_chain->address (), dr_chain.address (),
5857 length * sizeof (tree));
5859 if (length == 3)
5861 /* vect_grouped_load_supported ensures that this is constant. */
5862 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5863 unsigned int k;
5865 vec_perm_builder sel (nelt, nelt, 1);
5866 sel.quick_grow (nelt);
5867 vec_perm_indices indices;
5868 for (k = 0; k < 3; k++)
5870 for (i = 0; i < nelt; i++)
5871 if (3 * i + k < 2 * nelt)
5872 sel[i] = 3 * i + k;
5873 else
5874 sel[i] = 0;
5875 indices.new_vector (sel, 2, nelt);
5876 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5878 for (i = 0, j = 0; i < nelt; i++)
5879 if (3 * i + k < 2 * nelt)
5880 sel[i] = i;
5881 else
5882 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5883 indices.new_vector (sel, 2, nelt);
5884 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5886 first_vect = dr_chain[0];
5887 second_vect = dr_chain[1];
5889 /* Create interleaving stmt (low part of):
5890 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5891 ...}> */
5892 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5893 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5894 second_vect, perm3_mask_low);
5895 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5897 /* Create interleaving stmt (high part of):
5898 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5899 ...}> */
5900 first_vect = data_ref;
5901 second_vect = dr_chain[2];
5902 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5903 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5904 second_vect, perm3_mask_high);
5905 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5906 (*result_chain)[k] = data_ref;
5909 else
5911 /* If length is not equal to 3 then only power of 2 is supported. */
5912 gcc_assert (pow2p_hwi (length));
5914 /* The encoding has a single stepped pattern. */
5915 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5916 vec_perm_builder sel (nelt, 1, 3);
5917 sel.quick_grow (3);
5918 for (i = 0; i < 3; ++i)
5919 sel[i] = i * 2;
5920 vec_perm_indices indices (sel, 2, nelt);
5921 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5923 for (i = 0; i < 3; ++i)
5924 sel[i] = i * 2 + 1;
5925 indices.new_vector (sel, 2, nelt);
5926 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5928 for (i = 0; i < log_length; i++)
5930 for (j = 0; j < length; j += 2)
5932 first_vect = dr_chain[j];
5933 second_vect = dr_chain[j+1];
5935 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5936 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5937 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5938 first_vect, second_vect,
5939 perm_mask_even);
5940 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5941 (*result_chain)[j/2] = data_ref;
5943 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5944 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5945 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5946 first_vect, second_vect,
5947 perm_mask_odd);
5948 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5949 (*result_chain)[j/2+length/2] = data_ref;
5951 memcpy (dr_chain.address (), result_chain->address (),
5952 length * sizeof (tree));
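/* Illustrative sketch, not part of GCC: a scalar model of the log2 (LENGTH)
   extract_even/odd stages implemented above, applied to plain element
   indices.  CHAIN holds LENGTH vectors of NELT indices laid out contiguously,
   LENGTH is a power of two, and SCRATCH has the same size as CHAIN.  For
   LENGTH == 4 and NELT == 8 the final contents of CHAIN are the
   de-interleaved order documented in the comment (0 4 8 12 ..., 1 5 9 13 ...,
   ...).  The helper name is hypothetical.  */

static void
model_load_deinterleave (unsigned int *chain, unsigned int length,
                         unsigned int nelt, unsigned int *scratch)
{
  for (unsigned int stage = 1; stage < length; stage *= 2)
    {
      for (unsigned int j = 0; j < length; j += 2)
        {
          const unsigned int *v1 = chain + j * nelt;
          const unsigned int *v2 = chain + (j + 1) * nelt;
          unsigned int *even = scratch + (j / 2) * nelt;
          unsigned int *odd = scratch + (j / 2 + length / 2) * nelt;
          for (unsigned int i = 0; i < nelt; i++)
            {
              /* perm_mask_even: {0, 2, 4, ...} over the concatenation.  */
              even[i] = (2 * i < nelt ? v1[2 * i] : v2[2 * i - nelt]);
              /* perm_mask_odd: {1, 3, 5, ...} over the concatenation.  */
              odd[i] = (2 * i + 1 < nelt
                        ? v1[2 * i + 1] : v2[2 * i + 1 - nelt]);
            }
        }
      /* Mirror the memcpy back into DR_CHAIN at the end of each stage.  */
      for (unsigned int i = 0; i < length * nelt; i++)
        chain[i] = scratch[i];
    }
}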
5957 /* Function vect_shift_permute_load_chain.
5959 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5960 sequence of stmts to reorder the input data accordingly.
5961 Return the final references for loads in RESULT_CHAIN.
5962 Return true if successful, false otherwise.
5964 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5965 The input is 3 vectors each containing 8 elements. We assign a
5966 number to each element, the input sequence is:
5968 1st vec: 0 1 2 3 4 5 6 7
5969 2nd vec: 8 9 10 11 12 13 14 15
5970 3rd vec: 16 17 18 19 20 21 22 23
5972 The output sequence should be:
5974 1st vec: 0 3 6 9 12 15 18 21
5975 2nd vec: 1 4 7 10 13 16 19 22
5976 3rd vec: 2 5 8 11 14 17 20 23
5978 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5980 First we shuffle all 3 vectors to get correct elements order:
5982 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5983 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5984 3rd vec: (16 19 22) (17 20 23) (18 21)
5986 Next we unite and shift vector 3 times:
5988 1st step:
5989 shift right by 6 the concatenation of:
5990 "1st vec" and "2nd vec"
5991 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5992 "2nd vec" and "3rd vec"
5993 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5994 "3rd vec" and "1st vec"
5995 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5996 | New vectors |
5998 So that now new vectors are:
6000 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6001 2nd vec: (10 13) (16 19 22) (17 20 23)
6002 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6004 2nd step:
6005 shift right by 5 the concatenation of:
6006 "1st vec" and "3rd vec"
6007 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6008 "2nd vec" and "1st vec"
6009 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6010 "3rd vec" and "2nd vec"
6011 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6012 | New vectors |
6014 So that now new vectors are:
6016 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6017 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6018 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6020 3rd step:
6021 shift right by 5 the concatenation of:
6022 "1st vec" and "1st vec"
6023 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6024 shift right by 3 the concatenation of:
6025 "2nd vec" and "2nd vec"
6026 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6027 | New vectors |
6029 So that now all vectors are READY:
6030 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6031 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6032 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6034 This algorithm is faster than the one in vect_permute_load_chain if:
6035 1. "shift of a concatenation" is faster than general permutation.
6036 This is usually so.
6037 2. The TARGET machine can't execute vector instructions in parallel.
6038 This is because each step of the algorithm depends on the previous one.
6039 The algorithm in vect_permute_load_chain is much more parallel.
6041 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6044 static bool
6045 vect_shift_permute_load_chain (vec<tree> dr_chain,
6046 unsigned int length,
6047 gimple *stmt,
6048 gimple_stmt_iterator *gsi,
6049 vec<tree> *result_chain)
6051 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6052 tree perm2_mask1, perm2_mask2, perm3_mask;
6053 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6054 gimple *perm_stmt;
6056 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6057 unsigned int i;
6058 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6059 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6061 unsigned HOST_WIDE_INT nelt, vf;
6062 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6063 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6064 /* Not supported for variable-length vectors. */
6065 return false;
6067 vec_perm_builder sel (nelt, nelt, 1);
6068 sel.quick_grow (nelt);
6070 result_chain->quick_grow (length);
6071 memcpy (result_chain->address (), dr_chain.address (),
6072 length * sizeof (tree));
6074 if (pow2p_hwi (length) && vf > 4)
6076 unsigned int j, log_length = exact_log2 (length);
6077 for (i = 0; i < nelt / 2; ++i)
6078 sel[i] = i * 2;
6079 for (i = 0; i < nelt / 2; ++i)
6080 sel[nelt / 2 + i] = i * 2 + 1;
6081 vec_perm_indices indices (sel, 2, nelt);
6082 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6084 if (dump_enabled_p ())
6085 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6086 "shuffle of 2 fields structure is not \
6087 supported by target\n");
6088 return false;
6090 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6092 for (i = 0; i < nelt / 2; ++i)
6093 sel[i] = i * 2 + 1;
6094 for (i = 0; i < nelt / 2; ++i)
6095 sel[nelt / 2 + i] = i * 2;
6096 indices.new_vector (sel, 2, nelt);
6097 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6099 if (dump_enabled_p ())
6100 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6101 "shuffle of 2 fields structure is not \
6102 supported by target\n");
6103 return false;
6105 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6107 /* Generating permutation constant to shift all elements.
6108 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6109 for (i = 0; i < nelt; i++)
6110 sel[i] = nelt / 2 + i;
6111 indices.new_vector (sel, 2, nelt);
6112 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6114 if (dump_enabled_p ())
6115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6116 "shift permutation is not supported by target\n");
6117 return false;
6119 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6121 /* Generating permutation constant to select vector from 2.
6122 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6123 for (i = 0; i < nelt / 2; i++)
6124 sel[i] = i;
6125 for (i = nelt / 2; i < nelt; i++)
6126 sel[i] = nelt + i;
6127 indices.new_vector (sel, 2, nelt);
6128 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6130 if (dump_enabled_p ())
6131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6132 "select is not supported by target\n");
6133 return false;
6135 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6137 for (i = 0; i < log_length; i++)
6139 for (j = 0; j < length; j += 2)
6141 first_vect = dr_chain[j];
6142 second_vect = dr_chain[j + 1];
6144 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6145 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6146 first_vect, first_vect,
6147 perm2_mask1);
6148 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6149 vect[0] = data_ref;
6151 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6152 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6153 second_vect, second_vect,
6154 perm2_mask2);
6155 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6156 vect[1] = data_ref;
6158 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6159 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6160 vect[0], vect[1], shift1_mask);
6161 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6162 (*result_chain)[j/2 + length/2] = data_ref;
6164 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6165 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6166 vect[0], vect[1], select_mask);
6167 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6168 (*result_chain)[j/2] = data_ref;
6170 memcpy (dr_chain.address (), result_chain->address (),
6171 length * sizeof (tree));
6173 return true;
6175 if (length == 3 && vf > 2)
6177 unsigned int k = 0, l = 0;
6179 /* Generating permutation constant to get all elements in the right order.
6180 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6181 for (i = 0; i < nelt; i++)
6183 if (3 * k + (l % 3) >= nelt)
6185 k = 0;
6186 l += (3 - (nelt % 3));
6188 sel[i] = 3 * k + (l % 3);
6189 k++;
6191 vec_perm_indices indices (sel, 2, nelt);
6192 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6194 if (dump_enabled_p ())
6195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6196 "shuffle of 3 fields structure is not \
6197 supported by target\n");
6198 return false;
6200 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6202 /* Generating permutation constant to shift all elements.
6203 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6204 for (i = 0; i < nelt; i++)
6205 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6206 indices.new_vector (sel, 2, nelt);
6207 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6209 if (dump_enabled_p ())
6210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6211 "shift permutation is not supported by target\n");
6212 return false;
6214 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6216 /* Generating permutation constant to shift all elements.
6217 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6218 for (i = 0; i < nelt; i++)
6219 sel[i] = 2 * (nelt / 3) + 1 + i;
6220 indices.new_vector (sel, 2, nelt);
6221 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6223 if (dump_enabled_p ())
6224 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6225 "shift permutation is not supported by target\n");
6226 return false;
6228 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6230 /* Generating permutation constant to shift all elements.
6231 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6232 for (i = 0; i < nelt; i++)
6233 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6234 indices.new_vector (sel, 2, nelt);
6235 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6237 if (dump_enabled_p ())
6238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6239 "shift permutation is not supported by target\n");
6240 return false;
6242 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6244 /* Generating permutation constant to shift all elements.
6245 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6246 for (i = 0; i < nelt; i++)
6247 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6248 indices.new_vector (sel, 2, nelt);
6249 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6251 if (dump_enabled_p ())
6252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6253 "shift permutation is not supported by target\n");
6254 return false;
6256 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6258 for (k = 0; k < 3; k++)
6260 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6261 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6262 dr_chain[k], dr_chain[k],
6263 perm3_mask);
6264 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6265 vect[k] = data_ref;
6268 for (k = 0; k < 3; k++)
6270 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6271 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6272 vect[k % 3], vect[(k + 1) % 3],
6273 shift1_mask);
6274 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6275 vect_shift[k] = data_ref;
6278 for (k = 0; k < 3; k++)
6280 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6281 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6282 vect_shift[(4 - k) % 3],
6283 vect_shift[(3 - k) % 3],
6284 shift2_mask);
6285 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6286 vect[k] = data_ref;
6289 (*result_chain)[3 - (nelt % 3)] = vect[2];
6291 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6292 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6293 vect[0], shift3_mask);
6294 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6295 (*result_chain)[nelt % 3] = data_ref;
6297 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6298 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6299 vect[1], shift4_mask);
6300 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6301 (*result_chain)[0] = data_ref;
6302 return true;
6304 return false;
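/* Illustrative sketch, not part of GCC: a scalar model of one step of the
   power-of-two path above.  Each input vector is first shuffled so that its
   even elements occupy one half and its odd elements the other
   (perm2_mask1/perm2_mask2); a single cross-vector "shift" (shift1_mask) and
   a "select" (select_mask) then yield the same even/odd split that
   vect_permute_load_chain computes with two general two-operand permutes.
   The helper name is hypothetical.  */

static void
model_shift_permute_pair (const unsigned int *first,
                          const unsigned int *second, unsigned int nelt,
                          unsigned int *out_even, unsigned int *out_odd)
{
  for (unsigned int i = 0; i < nelt / 2; i++)
    {
      /* perm2_mask1 applied to FIRST: evens in the low half, odds in the
         high half.  perm2_mask2 applied to SECOND: odds in the low half,
         evens in the high half.  */
      unsigned int v0_lo = first[2 * i];
      unsigned int v0_hi = first[2 * i + 1];
      unsigned int v1_lo = second[2 * i + 1];
      unsigned int v1_hi = second[2 * i];

      /* select_mask {0 .. nelt/2-1, nelt + nelt/2 .. 2*nelt-1}: low half of
         vect[0] followed by high half of vect[1] -> the even elements.  */
      out_even[i] = v0_lo;
      out_even[nelt / 2 + i] = v1_hi;

      /* shift1_mask {nelt/2 .. nelt/2 + nelt - 1}: high half of vect[0]
         followed by low half of vect[1] -> the odd elements.  */
      out_odd[i] = v0_hi;
      out_odd[nelt / 2 + i] = v1_lo;
    }
}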
6307 /* Function vect_transform_grouped_load.
6309 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6310 to perform their permutation and ascribe the result vectorized statements to
6311 the scalar statements.
6314 void
6315 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6316 gimple_stmt_iterator *gsi)
6318 machine_mode mode;
6319 vec<tree> result_chain = vNULL;
6321 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6322 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6323 vectors, that are ready for vector computation. */
6324 result_chain.create (size);
6326 /* If the reassociation width for the vector type is 2 or greater, the target
6327 machine can execute 2 or more vector instructions in parallel. Otherwise try
6328 to get a chain for the load group using vect_shift_permute_load_chain. */
6329 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6330 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6331 || pow2p_hwi (size)
6332 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6333 gsi, &result_chain))
6334 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6335 vect_record_grouped_load_vectors (stmt, result_chain);
6336 result_chain.release ();
6339 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6340 generated as part of the vectorization of STMT. Assign the statement
6341 for each vector to the associated scalar statement. */
6343 void
6344 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6346 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6347 vec_info *vinfo = stmt_info->vinfo;
6348 gimple *first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info);
6349 gimple *next_stmt;
6350 unsigned int i, gap_count;
6351 tree tmp_data_ref;
6353 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6354 Since we scan the chain starting from its first node, their order
6355 corresponds to the order of data-refs in RESULT_CHAIN.
6356 next_stmt = first_stmt;
6357 gap_count = 1;
6358 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6360 if (!next_stmt)
6361 break;
6363 /* Skip the gaps. Loads created for the gaps will be removed by the dead
6364 code elimination pass later. No need to check for the first stmt in
6365 the group, since it always exists.
6366 DR_GROUP_GAP is the number of steps in elements from the previous
6367 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6368 correspond to the gaps. */
6369 if (next_stmt != first_stmt
6370 && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
6372 gap_count++;
6373 continue;
6376 while (next_stmt)
6378 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6379 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6380 copies, and we put the new vector statement in the first available
6381 RELATED_STMT. */
6382 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6383 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt_info;
6384 else
6386 if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6388 stmt_vec_info prev_stmt_info
6389 = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6390 stmt_vec_info rel_stmt_info
6391 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6392 while (rel_stmt_info)
6394 prev_stmt_info = rel_stmt_info;
6395 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6398 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6402 next_stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6403 gap_count = 1;
6404 /* If NEXT_STMT accesses the same DR as the previous statement,
6405 put the same TMP_DATA_REF as its vectorized statement; otherwise
6406 get the next data-ref from RESULT_CHAIN. */
6407 if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6408 break;
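/* Illustrative sketch, not part of GCC: the gap handling above, reduced to
   indices.  Each group member carries a gap count (DR_GROUP_GAP, 1 when
   there is no gap); a permuted vector from RESULT_CHAIN is assigned only
   once GAP_COUNT has caught up with that distance, so vectors generated for
   the gaps are simply skipped.  The multiple-copy and same-DR handling of
   the real code is omitted; the type and helper names are hypothetical.  */

struct group_member_model
{
  unsigned int gap;             /* Modelled DR_GROUP_GAP.  */
  int assigned_vector;          /* Index into RESULT_CHAIN, or -1.  */
};

static void
model_record_group_vectors (struct group_member_model *members,
                            unsigned int n_members, unsigned int n_results)
{
  unsigned int member = 0, gap_count = 1;
  for (unsigned int r = 0; r < n_results && member < n_members; r++)
    {
      /* Skip vectors that correspond to gaps (never for the first member,
         which always exists).  */
      if (member != 0 && gap_count < members[member].gap)
        {
          gap_count++;
          continue;
        }
      members[member].assigned_vector = (int) r;
      member++;
      gap_count = 1;
    }
}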
6413 /* Function vect_can_force_dr_alignment_p.
6415 Returns whether the alignment of a DECL can be forced to be aligned
6416 on an ALIGNMENT-bit boundary. */
6418 bool
6419 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6421 if (!VAR_P (decl))
6422 return false;
6424 if (decl_in_symtab_p (decl)
6425 && !symtab_node::get (decl)->can_increase_alignment_p ())
6426 return false;
6428 if (TREE_STATIC (decl))
6429 return (alignment <= MAX_OFILE_ALIGNMENT);
6430 else
6431 return (alignment <= MAX_STACK_ALIGNMENT);
6435 /* Return whether the data reference DR is supported with respect to its
6436 alignment.
6437 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even if
6438 it is aligned, i.e., check if it is possible to vectorize it with different
6439 alignment. */
6441 enum dr_alignment_support
6442 vect_supportable_dr_alignment (struct data_reference *dr,
6443 bool check_aligned_accesses)
6445 stmt_vec_info stmt_info = vect_dr_stmt (dr);
6446 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6447 machine_mode mode = TYPE_MODE (vectype);
6448 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6449 struct loop *vect_loop = NULL;
6450 bool nested_in_vect_loop = false;
6452 if (aligned_access_p (dr) && !check_aligned_accesses)
6453 return dr_aligned;
6455 /* For now assume all conditional loads/stores support unaligned
6456 access without any special code. */
6457 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6458 if (gimple_call_internal_p (stmt)
6459 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6460 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6461 return dr_unaligned_supported;
6463 if (loop_vinfo)
6465 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6466 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6469 /* Possibly unaligned access. */
6471 /* We can choose between using the implicit realignment scheme (generating
6472 a misaligned_move stmt) and the explicit realignment scheme (generating
6473 aligned loads with a REALIGN_LOAD). There are two variants to the
6474 explicit realignment scheme: optimized, and unoptimized.
6475 We can optimize the realignment only if the step between consecutive
6476 vector loads is equal to the vector size. Since the vector memory
6477 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6478 is guaranteed that the misalignment amount remains the same throughout the
6479 execution of the vectorized loop. Therefore, we can create the
6480 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6481 at the loop preheader.
6483 However, in the case of outer-loop vectorization, when vectorizing a
6484 memory access in the inner-loop nested within the LOOP that is now being
6485 vectorized, while it is guaranteed that the misalignment of the
6486 vectorized memory access will remain the same in different outer-loop
6487 iterations, it is *not* guaranteed that it will remain the same throughout
6488 the execution of the inner-loop. This is because the inner-loop advances
6489 with the original scalar step (and not in steps of VS). If the inner-loop
6490 step happens to be a multiple of VS, then the misalignment remains fixed
6491 and we can use the optimized realignment scheme. For example:
6493 for (i=0; i<N; i++)
6494 for (j=0; j<M; j++)
6495 s += a[i+j];
6497 When vectorizing the i-loop in the above example, the step between
6498 consecutive vector loads is 1, and so the misalignment does not remain
6499 fixed across the execution of the inner-loop, and the realignment cannot
6500 be optimized (as illustrated in the following pseudo vectorized loop):
6502 for (i=0; i<N; i+=4)
6503 for (j=0; j<M; j++){
6504 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6505 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6506 // (assuming that we start from an aligned address).
6509 We therefore have to use the unoptimized realignment scheme:
6511 for (i=0; i<N; i+=4)
6512 for (j=k; j<M; j+=4)
6513 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6514 // that the misalignment of the initial address is
6515 // 0).
6517 The loop can then be vectorized as follows:
6519 for (k=0; k<4; k++){
6520 rt = get_realignment_token (&vp[k]);
6521 for (i=0; i<N; i+=4){
6522 v1 = vp[i+k];
6523 for (j=k; j<M; j+=4){
6524 v2 = vp[i+j+VS-1];
6525 va = REALIGN_LOAD <v1,v2,rt>;
6526 vs += va;
6527 v1 = v2;
6530 } */
6532 if (DR_IS_READ (dr))
6534 bool is_packed = false;
6535 tree type = (TREE_TYPE (DR_REF (dr)));
6537 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6538 && (!targetm.vectorize.builtin_mask_for_load
6539 || targetm.vectorize.builtin_mask_for_load ()))
6541 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6543 /* If we are doing SLP then the accesses need not have the
6544 same alignment, instead it depends on the SLP group size. */
6545 if (loop_vinfo
6546 && STMT_SLP_TYPE (stmt_info)
6547 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6548 * DR_GROUP_SIZE (vinfo_for_stmt
6549 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6550 TYPE_VECTOR_SUBPARTS (vectype)))
6552 else if (!loop_vinfo
6553 || (nested_in_vect_loop
6554 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6555 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6556 return dr_explicit_realign;
6557 else
6558 return dr_explicit_realign_optimized;
6560 if (!known_alignment_for_access_p (dr))
6561 is_packed = not_size_aligned (DR_REF (dr));
6563 if (targetm.vectorize.support_vector_misalignment
6564 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6565 /* Can't software pipeline the loads, but can at least do them. */
6566 return dr_unaligned_supported;
6568 else
6570 bool is_packed = false;
6571 tree type = (TREE_TYPE (DR_REF (dr)));
6573 if (!known_alignment_for_access_p (dr))
6574 is_packed = not_size_aligned (DR_REF (dr));
6576 if (targetm.vectorize.support_vector_misalignment
6577 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6578 return dr_unaligned_supported;
6581 /* Unsupported. */
6582 return dr_unaligned_unsupported;
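/* Illustrative sketch, not part of GCC: the misalignment pattern from the
   outer-loop example in the comment of vect_supportable_dr_alignment.  With
   an inner-loop step of one element and VS elements per vector, the element
   misalignment of &vp[i + j] cycles through 0, 1, ..., VS-1 as j advances
   (assuming the initial address is aligned), which is why the optimized
   realignment scheme, whose token is computed once in the preheader, cannot
   be used there; the misalignment stays fixed only if the inner step is a
   multiple of VS.  The helper name is hypothetical.  */

static unsigned int
model_inner_loop_misalignment (unsigned int j, unsigned int vs)
{
  return j % vs;
}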