gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "params.h"
  53 #include "tree-cfg.h"
  54 #include "tree-hash-traits.h"
  55 #include "vec-perm-indices.h"
  56 #include "internal-fn.h"
  57
  58 /* Return true if load- or store-lanes optab OPTAB is implemented for
  59    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  60
  61 static bool
  62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  63                               tree vectype, unsigned HOST_WIDE_INT count)
  64 {
  65   machine_mode mode, array_mode;
  66   bool limit_p;
  67
  68   mode = TYPE_MODE (vectype);
  69   if (!targetm.array_mode (mode, count).exists (&array_mode))
  70     {
  71       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
  72       limit_p = !targetm.array_mode_supported_p (mode, count);
  73       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
  74         {
  75           if (dump_enabled_p ())
  76             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  77                              "no array mode for %s["
  78                              HOST_WIDE_INT_PRINT_DEC "]\n",
  79                              GET_MODE_NAME (mode), count);
  80           return false;
  81         }
  82     }
  83
  84   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  85     {
  86       if (dump_enabled_p ())
  87         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  88                          "cannot use %s<%s><%s>\n", name,
  89                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  90       return false;
  91     }
  92
  93   if (dump_enabled_p ())
  94     dump_printf_loc (MSG_NOTE, vect_location,
  95                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  96                      GET_MODE_NAME (mode));
  97
  98   return true;
  99 }
 100
 101
 102 /* Return the smallest scalar part of STMT.
 103    This is used to determine the vectype of the stmt.  We generally set the
 104    vectype according to the type of the result (lhs).  For stmts whose
 105    result-type is different than the type of the arguments (e.g., demotion,
 106    promotion), vectype will be reset appropriately (later).  Note that we have
 107    to visit the smallest datatype in this function, because that determines the
 108    VF.  If the smallest datatype in the loop is present only as the rhs of a
 109    promotion operation - we'd miss it.
 110    Such a case, where a variable of this datatype does not appear in the lhs
 111    anywhere in the loop, can only occur if it's an invariant: e.g.:
 112    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 113    invariant motion.  However, we cannot rely on invariant motion to always
 114    take invariants out of the loop, and so in the case of promotion we also
 115    have to check the rhs.
 116    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 117    types.  */
 118
 119 tree
 120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
 121                                HOST_WIDE_INT *rhs_size_unit)
 122 {
 123   tree scalar_type = gimple_expr_type (stmt);
 124   HOST_WIDE_INT lhs, rhs;
 125
 126   /* During the analysis phase, this function is called on arbitrary
 127      statements that might not have scalar results.  */
 128   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
 129     return scalar_type;
 130
 131   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 132
 133   if (is_gimple_assign (stmt)
 134       && (gimple_assign_cast_p (stmt)
 135           || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
 136           || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
 137           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
 138           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
 139           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
 140     {
 141       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 142
 143       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 144       if (rhs < lhs)
 145         scalar_type = rhs_type;
 146     }
 147
 148   *lhs_size_unit = lhs;
 149   *rhs_size_unit = rhs;
 150   return scalar_type;
 151 }
 152
 153
 154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 155    tested at run-time.  Return TRUE if DDR was successfully inserted.
 156    Return false if versioning is not supported.  */
 157
 158 static bool
 159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 160 {
 161   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 162
 163   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 164     return false;
 165
 166   if (!runtime_alias_check_p (ddr, loop,
 167                               optimize_loop_nest_for_speed_p (loop)))
 168     return false;
 169
 170   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 171   return true;
 172 }
 173
 174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
 175
 176 static void
 177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
 178 {
 179   vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
 180   for (unsigned int i = 0; i < checks.length(); ++i)
 181     if (checks[i] == value)
 182       return;
 183
 184   if (dump_enabled_p ())
 185     {
 186       dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
 187       dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
 188       dump_printf (MSG_NOTE, " is nonzero\n");
 189     }
 190   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
 191 }
 192
 193 /* Return true if we know that the order of vectorized STMT_A and
 194    vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
 195    At least one of the statements is a write.  */
 196
 197 static bool
 198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
 199 {
 200   stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
 201   stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
 202
 203   /* Single statements are always kept in their original order.  */
 204   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 205       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 206     return true;
 207
 208   /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
 209      group are emitted at the position of the first scalar load and all
 210      stores in a group are emitted at the position of the last scalar store.
 211      Thus writes will happen no earlier than their current position
 212      (but could happen later) while reads will happen no later than their
 213      current position (but could happen earlier).  Reordering is therefore
 214      only possible if the first access is a write.  */
 215   if (is_pattern_stmt_p (stmtinfo_a))
 216     stmt_a = STMT_VINFO_RELATED_STMT (stmtinfo_a);
 217   if (is_pattern_stmt_p (stmtinfo_b))
 218     stmt_b = STMT_VINFO_RELATED_STMT (stmtinfo_b);
 219   gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
 220   return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
 221 }
 222
 223 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 224    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 225    distances.  These distances are conservatively correct but they don't
 226    reflect a guaranteed dependence.
 227
 228    Return true if this function does all the work necessary to avoid
 229    an alias or false if the caller should use the dependence distances
 230    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 231    the depth of the loop described by LOOP_VINFO and the other arguments
 232    are as for vect_analyze_data_ref_dependence.  */
 233
 234 static bool
 235 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 236                                        loop_vec_info loop_vinfo,
 237                                        int loop_depth, unsigned int *max_vf)
 238 {
 239   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 240   lambda_vector dist_v;
 241   unsigned int i;
 242   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 243     {
 244       int dist = dist_v[loop_depth];
 245       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 246         {
 247           /* If the user asserted safelen >= DIST consecutive iterations
 248              can be executed concurrently, assume independence.
 249
 250              ??? An alternative would be to add the alias check even
 251              in this case, and vectorize the fallback loop with the
 252              maximum VF set to safelen.  However, if the user has
 253              explicitly given a length, it's less likely that that
 254              would be a win.  */
 255           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 256             {
 257               if ((unsigned int) loop->safelen < *max_vf)
 258                 *max_vf = loop->safelen;
 259               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 260               continue;
 261             }
 262
 263           /* For dependence distances of 2 or more, we have the option
 264              of limiting VF or checking for an alias at runtime.
 265              Prefer to check at runtime if we can, to avoid limiting
 266              the VF unnecessarily when the bases are in fact independent.
 267
 268              Note that the alias checks will be removed if the VF ends up
 269              being small enough.  */
 270           return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 271         }
 272     }
 273   return true;
 274 }
 275
 276
 277 /* Function vect_analyze_data_ref_dependence.
 278
 279    Return TRUE if there (might) exist a dependence between a memory-reference
 280    DRA and a memory-reference DRB.  When versioning for alias may check a
 281    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 282    the data dependence.  */
 283
 284 static bool
 285 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 286                                   loop_vec_info loop_vinfo,
 287                                   unsigned int *max_vf)
 288 {
 289   unsigned int i;
 290   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 291   struct data_reference *dra = DDR_A (ddr);
 292   struct data_reference *drb = DDR_B (ddr);
 293   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
 294   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
 295   lambda_vector dist_v;
 296   unsigned int loop_depth;
 297
 298   /* In loop analysis all data references should be vectorizable.  */
 299   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 300       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 301     gcc_unreachable ();
 302
 303   /* Independent data accesses.  */
 304   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 305     return false;
 306
 307   if (dra == drb
 308       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 309     return false;
 310
 311   /* We do not have to consider dependences between accesses that belong
 312      to the same group, unless the stride could be smaller than the
 313      group size.  */
 314   if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 315       && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 316           == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
 317       && !STMT_VINFO_STRIDED_P (stmtinfo_a))
 318     return false;
 319
 320   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 321      least two scalar iterations, there is always also a true dependence.
 322      As the vectorizer does not re-order loads and stores we can ignore
 323      the anti-dependence if TBAA can disambiguate both DRs similar to the
 324      case with known negative distance anti-dependences (positive
 325      distance anti-dependences would violate TBAA constraints).  */
 326   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 327        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 328       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 329                                  get_alias_set (DR_REF (drb))))
 330     return false;
 331
 332   /* Unknown data dependence.  */
 333   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 334     {
 335       /* If user asserted safelen consecutive iterations can be
 336          executed concurrently, assume independence.  */
 337       if (loop->safelen >= 2)
 338         {
 339           if ((unsigned int) loop->safelen < *max_vf)
 340             *max_vf = loop->safelen;
 341           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 342           return false;
 343         }
 344
 345       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 346           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 347         {
 348           if (dump_enabled_p ())
 349             {
 350               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 351                                "versioning for alias not supported for: "
 352                                "can't determine dependence between ");
 353               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 354                                  DR_REF (dra));
 355               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 356               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 357                                  DR_REF (drb));
 358               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 359             }
 360           return true;
 361         }
 362
 363       if (dump_enabled_p ())
 364         {
 365           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 366                            "versioning for alias required: "
 367                            "can't determine dependence between ");
 368           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 369                              DR_REF (dra));
 370           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 371           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 372                              DR_REF (drb));
 373           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 374         }
 375
 376       /* Add to list of ddrs that need to be tested at run-time.  */
 377       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 378     }
 379
 380   /* Known data dependence.  */
 381   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 382     {
 383       /* If user asserted safelen consecutive iterations can be
 384          executed concurrently, assume independence.  */
 385       if (loop->safelen >= 2)
 386         {
 387           if ((unsigned int) loop->safelen < *max_vf)
 388             *max_vf = loop->safelen;
 389           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 390           return false;
 391         }
 392
 393       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 394           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 395         {
 396           if (dump_enabled_p ())
 397             {
 398               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 399                                "versioning for alias not supported for: "
 400                                "bad dist vector for ");
 401               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 402                                  DR_REF (dra));
 403               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 404               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 405                                  DR_REF (drb));
 406               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 407             }
 408           return true;
 409         }
 410
 411       if (dump_enabled_p ())
 412         {
 413           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 414                            "versioning for alias required: "
 415                            "bad dist vector for ");
 416           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 417           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 418           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 419           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 420         }
 421       /* Add to list of ddrs that need to be tested at run-time.  */
 422       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 423     }
 424
 425   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 426
 427   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
 428       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
 429                                                 loop_depth, max_vf))
 430     return false;
 431
 432   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 433     {
 434       int dist = dist_v[loop_depth];
 435
 436       if (dump_enabled_p ())
 437         dump_printf_loc (MSG_NOTE, vect_location,
 438                          "dependence distance  = %d.\n", dist);
 439
 440       if (dist == 0)
 441         {
 442           if (dump_enabled_p ())
 443             {
 444               dump_printf_loc (MSG_NOTE, vect_location,
 445                                "dependence distance == 0 between ");
 446               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 447               dump_printf (MSG_NOTE, " and ");
 448               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 449               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 450             }
 451
 452           /* When we perform grouped accesses and perform implicit CSE
 453              by detecting equal accesses and doing disambiguation with
 454              runtime alias tests like for
 455                 .. = a[i];
 456                 .. = a[i+1];
 457                 a[i] = ..;
 458                 a[i+1] = ..;
 459                 *p = ..;
 460                 .. = a[i];
 461                 .. = a[i+1];
 462              where we will end up loading { a[i], a[i+1] } once, make
 463              sure that inserting group loads before the first load and
 464              stores after the last store will do the right thing.
 465              Similar for groups like
 466                 a[i] = ...;
 467                 ... = a[i];
 468                 a[i+1] = ...;
 469              where loads from the group interleave with the store.  */
 470           if (!vect_preserves_scalar_order_p (DR_STMT (dra), DR_STMT (drb)))
 471             {
 472               if (dump_enabled_p ())
 473                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 474                                  "READ_WRITE dependence in interleaving.\n");
 475               return true;
 476             }
 477
 478           if (loop->safelen < 2)
 479             {
 480               tree indicator = dr_zero_step_indicator (dra);
 481               if (TREE_CODE (indicator) != INTEGER_CST)
 482                 vect_check_nonzero_value (loop_vinfo, indicator);
 483               else if (integer_zerop (indicator))
 484                 {
 485                   if (dump_enabled_p ())
 486                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 487                                  "access also has a zero step\n");
 488                   return true;
 489                 }
 490             }
 491           continue;
 492         }
 493
 494       if (dist > 0 && DDR_REVERSED_P (ddr))
 495         {
 496           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 497              reversed (to make distance vector positive), and the actual
 498              distance is negative.  */
 499           if (dump_enabled_p ())
 500             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 501                              "dependence distance negative.\n");
 502           /* Record a negative dependence distance to later limit the
 503              amount of stmt copying / unrolling we can perform.
 504              Only need to handle read-after-write dependence.  */
 505           if (DR_IS_READ (drb)
 506               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 507                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 508             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 509           continue;
 510         }
 511
 512       unsigned int abs_dist = abs (dist);
 513       if (abs_dist >= 2 && abs_dist < *max_vf)
 514         {
 515           /* The dependence distance requires reduction of the maximal
 516              vectorization factor.  */
 517           *max_vf = abs (dist);
 518           if (dump_enabled_p ())
 519             dump_printf_loc (MSG_NOTE, vect_location,
 520                              "adjusting maximal vectorization factor to %i\n",
 521                              *max_vf);
 522         }
 523
 524       if (abs_dist >= *max_vf)
 525         {
 526           /* Dependence distance does not create dependence, as far as
 527              vectorization is concerned, in this case.  */
 528           if (dump_enabled_p ())
 529             dump_printf_loc (MSG_NOTE, vect_location,
 530                              "dependence distance >= VF.\n");
 531           continue;
 532         }
 533
 534       if (dump_enabled_p ())
 535         {
 536           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 537                        "not vectorized, possible dependence "
 538                        "between data-refs ");
 539           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 540           dump_printf (MSG_NOTE,  " and ");
 541           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 542           dump_printf (MSG_NOTE,  "\n");
 543         }
 544
 545       return true;
 546     }
 547
 548   return false;
 549 }
 550
 551 /* Function vect_analyze_data_ref_dependences.
 552
 553    Examine all the data references in the loop, and make sure there do not
 554    exist any data dependences between them.  Set *MAX_VF according to
 555    the maximum vectorization factor the data dependences allow.  */
 556
 557 bool
 558 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
 559                                    unsigned int *max_vf)
 560 {
 561   unsigned int i;
 562   struct data_dependence_relation *ddr;
 563
 564   if (dump_enabled_p ())
 565     dump_printf_loc (MSG_NOTE, vect_location,
 566                      "=== vect_analyze_data_ref_dependences ===\n");
 567
 568   LOOP_VINFO_DDRS (loop_vinfo)
 569     .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 570              * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 571   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 572   /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
 573   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 574                                 &LOOP_VINFO_DDRS (loop_vinfo),
 575                                 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
 576     return false;
 577
 578   /* For epilogues we either have no aliases or alias versioning
 579      was applied to original loop.  Therefore we may just get max_vf
 580      using VF of original loop.  */
 581   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 582     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 583   else
 584     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 585       if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
 586         return false;
 587
 588   return true;
 589 }
 590
 591
 592 /* Function vect_slp_analyze_data_ref_dependence.
 593
 594    Return TRUE if there (might) exist a dependence between a memory-reference
 595    DRA and a memory-reference DRB.  When versioning for alias may check a
 596    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 597    the data dependence.  */
 598
 599 static bool
 600 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
 601 {
 602   struct data_reference *dra = DDR_A (ddr);
 603   struct data_reference *drb = DDR_B (ddr);
 604
 605   /* We need to check dependences of statements marked as unvectorizable
 606      as well, they still can prohibit vectorization.  */
 607
 608   /* Independent data accesses.  */
 609   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 610     return false;
 611
 612   if (dra == drb)
 613     return false;
 614
 615   /* Read-read is OK.  */
 616   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 617     return false;
 618
 619   /* If dra and drb are part of the same interleaving chain consider
 620      them independent.  */
 621   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
 622       && (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
 623           == DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
 624     return false;
 625
 626   /* Unknown data dependence.  */
 627   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 628     {
 629       if  (dump_enabled_p ())
 630         {
 631           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 632                            "can't determine dependence between ");
 633           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 634           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 635           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 636           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 637         }
 638     }
 639   else if (dump_enabled_p ())
 640     {
 641       dump_printf_loc (MSG_NOTE, vect_location,
 642                        "determined dependence between ");
 643       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 644       dump_printf (MSG_NOTE, " and ");
 645       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 646       dump_printf (MSG_NOTE,  "\n");
 647     }
 648
 649   return true;
 650 }
 651
 652
 653 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 654    contain the vector of scalar stores of this instance if we are
 655    disambiguating the loads.  */
 656
 657 static bool
 658 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
 659                                    vec<gimple *> stores, gimple *last_store)
 660 {
 661   /* This walks over all stmts involved in the SLP load/store done
 662      in NODE verifying we can sink them up to the last stmt in the
 663      group.  */
 664   gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
 665   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 666     {
 667       gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
 668       if (access == last_access)
 669         continue;
 670       data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
 671       ao_ref ref;
 672       bool ref_initialized_p = false;
 673       for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
 674            gsi_stmt (gsi) != last_access; gsi_next (&gsi))
 675         {
 676           gimple *stmt = gsi_stmt (gsi);
 677           if (! gimple_vuse (stmt)
 678               || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
 679             continue;
 680
 681           /* If we couldn't record a (single) data reference for this
 682              stmt we have to resort to the alias oracle.  */
 683           data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
 684           if (!dr_b)
 685             {
 686               /* We are moving a store or sinking a load - this means
 687                  we cannot use TBAA for disambiguation.  */
 688               if (!ref_initialized_p)
 689                 ao_ref_init (&ref, DR_REF (dr_a));
 690               if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
 691                   || ref_maybe_used_by_stmt_p (stmt, &ref, false))
 692                 return false;
 693               continue;
 694             }
 695
 696           bool dependent = false;
 697           /* If we run into a store of this same instance (we've just
 698              marked those) then delay dependence checking until we run
 699              into the last store because this is where it will have
 700              been sunk to (and we verify if we can do that as well).  */
 701           if (gimple_visited_p (stmt))
 702             {
 703               if (stmt != last_store)
 704                 continue;
 705               unsigned i;
 706               gimple *store;
 707               FOR_EACH_VEC_ELT (stores, i, store)
 708                 {
 709                   data_reference *store_dr
 710                     = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
 711                   ddr_p ddr = initialize_data_dependence_relation
 712                                 (dr_a, store_dr, vNULL);
 713                   dependent = vect_slp_analyze_data_ref_dependence (ddr);
 714                   free_dependence_relation (ddr);
 715                   if (dependent)
 716                     break;
 717                 }
 718             }
 719           else
 720             {
 721               ddr_p ddr = initialize_data_dependence_relation (dr_a,
 722                                                                dr_b, vNULL);
 723               dependent = vect_slp_analyze_data_ref_dependence (ddr);
 724               free_dependence_relation (ddr);
 725             }
 726           if (dependent)
 727             return false;
 728         }
 729     }
 730   return true;
 731 }
 732
 733
 734 /* Function vect_analyze_data_ref_dependences.
 735
 736    Examine all the data references in the basic-block, and make sure there
 737    do not exist any data dependences between them.  Set *MAX_VF according to
 738    the maximum vectorization factor the data dependences allow.  */
 739
 740 bool
 741 vect_slp_analyze_instance_dependence (slp_instance instance)
 742 {
 743   if (dump_enabled_p ())
 744     dump_printf_loc (MSG_NOTE, vect_location,
 745                      "=== vect_slp_analyze_instance_dependence ===\n");
 746
 747   /* The stores of this instance are at the root of the SLP tree.  */
 748   slp_tree store = SLP_INSTANCE_TREE (instance);
 749   if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
 750     store = NULL;
 751
 752   /* Verify we can sink stores to the vectorized stmt insert location.  */
 753   gimple *last_store = NULL;
 754   if (store)
 755     {
 756       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
 757         return false;
 758
 759       /* Mark stores in this instance and remember the last one.  */
 760       last_store = vect_find_last_scalar_stmt_in_slp (store);
 761       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 762         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
 763     }
 764
 765   bool res = true;
 766
 767   /* Verify we can sink loads to the vectorized stmt insert location,
 768      special-casing stores of this instance.  */
 769   slp_tree load;
 770   unsigned int i;
 771   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
 772     if (! vect_slp_analyze_node_dependences (instance, load,
 773                                              store
 774                                              ? SLP_TREE_SCALAR_STMTS (store)
 775                                              : vNULL, last_store))
 776       {
 777         res = false;
 778         break;
 779       }
 780
 781   /* Unset the visited flag.  */
 782   if (store)
 783     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 784       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
 785
 786   return res;
 787 }
 788
 789 /* Record in VINFO the base alignment guarantee given by DRB.  STMT is
 790    the statement that contains DRB, which is useful for recording in the
 791    dump file.  */
 792
 793 static void
 794 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
 795                             innermost_loop_behavior *drb)
 796 {
 797   bool existed;
 798   innermost_loop_behavior *&entry
 799     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
 800   if (!existed || entry->base_alignment < drb->base_alignment)
 801     {
 802       entry = drb;
 803       if (dump_enabled_p ())
 804         {
 805           dump_printf_loc (MSG_NOTE, vect_location,
 806                            "recording new base alignment for ");
 807           dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
 808           dump_printf (MSG_NOTE, "\n");
 809           dump_printf_loc (MSG_NOTE, vect_location,
 810                            "  alignment:    %d\n", drb->base_alignment);
 811           dump_printf_loc (MSG_NOTE, vect_location,
 812                            "  misalignment: %d\n", drb->base_misalignment);
 813           dump_printf_loc (MSG_NOTE, vect_location,
 814                            "  based on:     ");
 815           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 816         }
 817     }
 818 }
 819
 820 /* If the region we're going to vectorize is reached, all unconditional
 821    data references occur at least once.  We can therefore pool the base
 822    alignment guarantees from each unconditional reference.  Do this by
 823    going through all the data references in VINFO and checking whether
 824    the containing statement makes the reference unconditionally.  If so,
 825    record the alignment of the base address in VINFO so that it can be
 826    used for all other references with the same base.  */
 827
 828 void
 829 vect_record_base_alignments (vec_info *vinfo)
 830 {
 831   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
 832   struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
 833   data_reference *dr;
 834   unsigned int i;
 835   FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
 836     {
 837       gimple *stmt = DR_STMT (dr);
 838     if (!DR_IS_CONDITIONAL_IN_STMT (dr)
 839         && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
 840       {
 841         gimple *stmt = DR_STMT (dr);
 842         vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
 843
 844         /* If DR is nested in the loop that is being vectorized, we can also
 845            record the alignment of the base wrt the outer loop.  */
 846         if (loop && nested_in_vect_loop_p (loop, stmt))
 847           {
 848             stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 849             vect_record_base_alignment
 850               (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
 851           }
 852       }
 853     }
 854 }
 855
 856 /* Return the target alignment for the vectorized form of DR.  */
 857
 858 static unsigned int
 859 vect_calculate_target_alignment (struct data_reference *dr)
 860 {
 861   gimple *stmt = DR_STMT (dr);
 862   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 863   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 864   return targetm.vectorize.preferred_vector_alignment (vectype);
 865 }
 866
 867 /* Function vect_compute_data_ref_alignment
 868
 869    Compute the misalignment of the data reference DR.
 870
 871    Output:
 872    1. If during the misalignment computation it is found that the data reference
 873       cannot be vectorized then false is returned.
 874    2. DR_MISALIGNMENT (DR) is defined.
 875
 876    FOR NOW: No analysis is actually performed. Misalignment is calculated
 877    only for trivial cases. TODO.  */
 878
 879 bool
 880 vect_compute_data_ref_alignment (struct data_reference *dr)
 881 {
 882   gimple *stmt = DR_STMT (dr);
 883   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 884   vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
 885   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 886   struct loop *loop = NULL;
 887   tree ref = DR_REF (dr);
 888   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 889
 890   if (dump_enabled_p ())
 891     dump_printf_loc (MSG_NOTE, vect_location,
 892                      "vect_compute_data_ref_alignment:\n");
 893
 894   if (loop_vinfo)
 895     loop = LOOP_VINFO_LOOP (loop_vinfo);
 896
 897   /* Initialize misalignment to unknown.  */
 898   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
 899
 900   innermost_loop_behavior *drb = vect_dr_behavior (dr);
 901   bool step_preserves_misalignment_p;
 902
 903   unsigned HOST_WIDE_INT vector_alignment
 904     = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
 905   DR_TARGET_ALIGNMENT (dr) = vector_alignment;
 906
 907   /* No step for BB vectorization.  */
 908   if (!loop)
 909     {
 910       gcc_assert (integer_zerop (drb->step));
 911       step_preserves_misalignment_p = true;
 912     }
 913
 914   /* In case the dataref is in an inner-loop of the loop that is being
 915      vectorized (LOOP), we use the base and misalignment information
 916      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 917      stays the same throughout the execution of the inner-loop, which is why
 918      we have to check that the stride of the dataref in the inner-loop evenly
 919      divides by the vector alignment.  */
 920   else if (nested_in_vect_loop_p (loop, stmt))
 921     {
 922       step_preserves_misalignment_p
 923         = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
 924
 925       if (dump_enabled_p ())
 926         {
 927           if (step_preserves_misalignment_p)
 928             dump_printf_loc (MSG_NOTE, vect_location,
 929                              "inner step divides the vector alignment.\n");
 930           else
 931             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 932                              "inner step doesn't divide the vector"
 933                              " alignment.\n");
 934         }
 935     }
 936
 937   /* Similarly we can only use base and misalignment information relative to
 938      an innermost loop if the misalignment stays the same throughout the
 939      execution of the loop.  As above, this is the case if the stride of
 940      the dataref evenly divides by the alignment.  */
 941   else
 942     {
 943       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 944       step_preserves_misalignment_p
 945         = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
 946
 947       if (!step_preserves_misalignment_p && dump_enabled_p ())
 948         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 949                          "step doesn't divide the vector alignment.\n");
 950     }
 951
 952   unsigned int base_alignment = drb->base_alignment;
 953   unsigned int base_misalignment = drb->base_misalignment;
 954
 955   /* Calculate the maximum of the pooled base address alignment and the
 956      alignment that we can compute for DR itself.  */
 957   innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
 958   if (entry && base_alignment < (*entry)->base_alignment)
 959     {
 960       base_alignment = (*entry)->base_alignment;
 961       base_misalignment = (*entry)->base_misalignment;
 962     }
 963
 964   if (drb->offset_alignment < vector_alignment
 965       || !step_preserves_misalignment_p
 966       /* We need to know whether the step wrt the vectorized loop is
 967          negative when computing the starting misalignment below.  */
 968       || TREE_CODE (drb->step) != INTEGER_CST)
 969     {
 970       if (dump_enabled_p ())
 971         {
 972           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 973                            "Unknown alignment for access: ");
 974           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 975           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 976         }
 977       return true;
 978     }
 979
 980   if (base_alignment < vector_alignment)
 981     {
 982       unsigned int max_alignment;
 983       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
 984       if (max_alignment < vector_alignment
 985           || !vect_can_force_dr_alignment_p (base,
 986                                              vector_alignment * BITS_PER_UNIT))
 987         {
 988           if (dump_enabled_p ())
 989             {
 990               dump_printf_loc (MSG_NOTE, vect_location,
 991                                "can't force alignment of ref: ");
 992               dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 993               dump_printf (MSG_NOTE, "\n");
 994             }
 995           return true;
 996         }
 997
 998       /* Force the alignment of the decl.
 999          NOTE: This is the only change to the code we make during
1000          the analysis phase, before deciding to vectorize the loop.  */
1001       if (dump_enabled_p ())
1002         {
1003           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
1004           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1005           dump_printf (MSG_NOTE, "\n");
1006         }
1007
1008       DR_VECT_AUX (dr)->base_decl = base;
1009       DR_VECT_AUX (dr)->base_misaligned = true;
1010       base_misalignment = 0;
1011     }
1012   poly_int64 misalignment
1013     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1014
1015   /* If this is a backward running DR then first access in the larger
1016      vectype actually is N-1 elements before the address in the DR.
1017      Adjust misalign accordingly.  */
1018   if (tree_int_cst_sgn (drb->step) < 0)
1019     /* PLUS because STEP is negative.  */
1020     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1021                      * TREE_INT_CST_LOW (drb->step));
1022
1023   unsigned int const_misalignment;
1024   if (!known_misalignment (misalignment, vector_alignment,
1025                            &const_misalignment))
1026     {
1027       if (dump_enabled_p ())
1028         {
1029           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1030                            "Non-constant misalignment for access: ");
1031           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1032           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1033         }
1034       return true;
1035     }
1036
1037   SET_DR_MISALIGNMENT (dr, const_misalignment);
1038
1039   if (dump_enabled_p ())
1040     {
1041       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1042                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1043       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1044       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1045     }
1046
1047   return true;
1048 }
1049
1050 /* Function vect_update_misalignment_for_peel.
1051    Sets DR's misalignment
1052    - to 0 if it has the same alignment as DR_PEEL,
1053    - to the misalignment computed using NPEEL if DR's salignment is known,
1054    - to -1 (unknown) otherwise.
1055
1056    DR - the data reference whose misalignment is to be adjusted.
1057    DR_PEEL - the data reference whose misalignment is being made
1058              zero in the vector loop by the peel.
1059    NPEEL - the number of iterations in the peel loop if the misalignment
1060            of DR_PEEL is known at compile time.  */
1061
1062 static void
1063 vect_update_misalignment_for_peel (struct data_reference *dr,
1064                                    struct data_reference *dr_peel, int npeel)
1065 {
1066   unsigned int i;
1067   vec<dr_p> same_aligned_drs;
1068   struct data_reference *current_dr;
1069   int dr_size = vect_get_scalar_dr_size (dr);
1070   int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1071   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1072   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1073
1074  /* For interleaved data accesses the step in the loop must be multiplied by
1075      the size of the interleaving group.  */
1076   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1077     dr_size *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
1078   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1079     dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1080
1081   /* It can be assumed that the data refs with the same alignment as dr_peel
1082      are aligned in the vector loop.  */
1083   same_aligned_drs
1084     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1085   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1086     {
1087       if (current_dr != dr)
1088         continue;
1089       gcc_assert (!known_alignment_for_access_p (dr)
1090                   || !known_alignment_for_access_p (dr_peel)
1091                   || (DR_MISALIGNMENT (dr) / dr_size
1092                       == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1093       SET_DR_MISALIGNMENT (dr, 0);
1094       return;
1095     }
1096
1097   if (known_alignment_for_access_p (dr)
1098       && known_alignment_for_access_p (dr_peel))
1099     {
1100       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1101       int misal = DR_MISALIGNMENT (dr);
1102       misal += negative ? -npeel * dr_size : npeel * dr_size;
1103       misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1104       SET_DR_MISALIGNMENT (dr, misal);
1105       return;
1106     }
1107
1108   if (dump_enabled_p ())
1109     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1110                      "to unknown (-1).\n");
1111   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1112 }
1113
1114
1115 /* Function verify_data_ref_alignment
1116
1117    Return TRUE if DR can be handled with respect to alignment.  */
1118
1119 static bool
1120 verify_data_ref_alignment (data_reference_p dr)
1121 {
1122   enum dr_alignment_support supportable_dr_alignment
1123     = vect_supportable_dr_alignment (dr, false);
1124   if (!supportable_dr_alignment)
1125     {
1126       if (dump_enabled_p ())
1127         {
1128           if (DR_IS_READ (dr))
1129             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1130                              "not vectorized: unsupported unaligned load.");
1131           else
1132             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1133                              "not vectorized: unsupported unaligned "
1134                              "store.");
1135
1136           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1137                              DR_REF (dr));
1138           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1139         }
1140       return false;
1141     }
1142
1143   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1144     dump_printf_loc (MSG_NOTE, vect_location,
1145                      "Vectorizing an unaligned access.\n");
1146
1147   return true;
1148 }
1149
1150 /* Function vect_verify_datarefs_alignment
1151
1152    Return TRUE if all data references in the loop can be
1153    handled with respect to alignment.  */
1154
1155 bool
1156 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1157 {
1158   vec<data_reference_p> datarefs = vinfo->datarefs;
1159   struct data_reference *dr;
1160   unsigned int i;
1161
1162   FOR_EACH_VEC_ELT (datarefs, i, dr)
1163     {
1164       gimple *stmt = DR_STMT (dr);
1165       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1166
1167       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1168         continue;
1169
1170       /* For interleaving, only the alignment of the first access matters.   */
1171       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1172           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1173         continue;
1174
1175       /* Strided accesses perform only component accesses, alignment is
1176          irrelevant for them.  */
1177       if (STMT_VINFO_STRIDED_P (stmt_info)
1178           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1179         continue;
1180
1181       if (! verify_data_ref_alignment (dr))
1182         return false;
1183     }
1184
1185   return true;
1186 }
1187
1188 /* Given an memory reference EXP return whether its alignment is less
1189    than its size.  */
1190
1191 static bool
1192 not_size_aligned (tree exp)
1193 {
1194   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1195     return true;
1196
1197   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1198           > get_object_alignment (exp));
1199 }
1200
1201 /* Function vector_alignment_reachable_p
1202
1203    Return true if vector alignment for DR is reachable by peeling
1204    a few loop iterations.  Return false otherwise.  */
1205
1206 static bool
1207 vector_alignment_reachable_p (struct data_reference *dr)
1208 {
1209   gimple *stmt = DR_STMT (dr);
1210   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1211   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1212
1213   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1214     {
1215       /* For interleaved access we peel only if number of iterations in
1216          the prolog loop ({VF - misalignment}), is a multiple of the
1217          number of the interleaved accesses.  */
1218       int elem_size, mis_in_elements;
1219
1220       /* FORNOW: handle only known alignment.  */
1221       if (!known_alignment_for_access_p (dr))
1222         return false;
1223
1224       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1225       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1226       elem_size = vector_element_size (vector_size, nelements);
1227       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1228
1229       if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1230         return false;
1231     }
1232
1233   /* If misalignment is known at the compile time then allow peeling
1234      only if natural alignment is reachable through peeling.  */
1235   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1236     {
1237       HOST_WIDE_INT elmsize =
1238                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1239       if (dump_enabled_p ())
1240         {
1241           dump_printf_loc (MSG_NOTE, vect_location,
1242                            "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1243           dump_printf (MSG_NOTE,
1244                        ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1245         }
1246       if (DR_MISALIGNMENT (dr) % elmsize)
1247         {
1248           if (dump_enabled_p ())
1249             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1250                              "data size does not divide the misalignment.\n");
1251           return false;
1252         }
1253     }
1254
1255   if (!known_alignment_for_access_p (dr))
1256     {
1257       tree type = TREE_TYPE (DR_REF (dr));
1258       bool is_packed = not_size_aligned (DR_REF (dr));
1259       if (dump_enabled_p ())
1260         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1261                          "Unknown misalignment, %snaturally aligned\n",
1262                          is_packed ? "not " : "");
1263       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1264     }
1265
1266   return true;
1267 }
1268
1269
1270 /* Calculate the cost of the memory access represented by DR.  */
1271
1272 static void
1273 vect_get_data_access_cost (struct data_reference *dr,
1274                            unsigned int *inside_cost,
1275                            unsigned int *outside_cost,
1276                            stmt_vector_for_cost *body_cost_vec,
1277                            stmt_vector_for_cost *prologue_cost_vec)
1278 {
1279   gimple *stmt = DR_STMT (dr);
1280   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1281   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1282   int ncopies;
1283
1284   if (PURE_SLP_STMT (stmt_info))
1285     ncopies = 1;
1286   else
1287     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1288
1289   if (DR_IS_READ (dr))
1290     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1291                         prologue_cost_vec, body_cost_vec, false);
1292   else
1293     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1294
1295   if (dump_enabled_p ())
1296     dump_printf_loc (MSG_NOTE, vect_location,
1297                      "vect_get_data_access_cost: inside_cost = %d, "
1298                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1299 }
1300
1301
1302 typedef struct _vect_peel_info
1303 {
1304   struct data_reference *dr;
1305   int npeel;
1306   unsigned int count;
1307 } *vect_peel_info;
1308
1309 typedef struct _vect_peel_extended_info
1310 {
1311   struct _vect_peel_info peel_info;
1312   unsigned int inside_cost;
1313   unsigned int outside_cost;
1314 } *vect_peel_extended_info;
1315
1316
1317 /* Peeling hashtable helpers.  */
1318
1319 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1320 {
1321   static inline hashval_t hash (const _vect_peel_info *);
1322   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1323 };
1324
1325 inline hashval_t
1326 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1327 {
1328   return (hashval_t) peel_info->npeel;
1329 }
1330
1331 inline bool
1332 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1333 {
1334   return (a->npeel == b->npeel);
1335 }
1336
1337
1338 /* Insert DR into peeling hash table with NPEEL as key.  */
1339
1340 static void
1341 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1342                           loop_vec_info loop_vinfo, struct data_reference *dr,
1343                           int npeel)
1344 {
1345   struct _vect_peel_info elem, *slot;
1346   _vect_peel_info **new_slot;
1347   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1348
1349   elem.npeel = npeel;
1350   slot = peeling_htab->find (&elem);
1351   if (slot)
1352     slot->count++;
1353   else
1354     {
1355       slot = XNEW (struct _vect_peel_info);
1356       slot->npeel = npeel;
1357       slot->dr = dr;
1358       slot->count = 1;
1359       new_slot = peeling_htab->find_slot (slot, INSERT);
1360       *new_slot = slot;
1361     }
1362
1363   if (!supportable_dr_alignment
1364       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1365     slot->count += VECT_MAX_COST;
1366 }
1367
1368
1369 /* Traverse peeling hash table to find peeling option that aligns maximum
1370    number of data accesses.  */
1371
1372 int
1373 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1374                                      _vect_peel_extended_info *max)
1375 {
1376   vect_peel_info elem = *slot;
1377
1378   if (elem->count > max->peel_info.count
1379       || (elem->count == max->peel_info.count
1380           && max->peel_info.npeel > elem->npeel))
1381     {
1382       max->peel_info.npeel = elem->npeel;
1383       max->peel_info.count = elem->count;
1384       max->peel_info.dr = elem->dr;
1385     }
1386
1387   return 1;
1388 }
1389
1390 /* Get the costs of peeling NPEEL iterations checking data access costs
1391    for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1392    misalignment will be zero after peeling.  */
1393
1394 static void
1395 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1396                                 struct data_reference *dr0,
1397                                 unsigned int *inside_cost,
1398                                 unsigned int *outside_cost,
1399                                 stmt_vector_for_cost *body_cost_vec,
1400                                 stmt_vector_for_cost *prologue_cost_vec,
1401                                 unsigned int npeel,
1402                                 bool unknown_misalignment)
1403 {
1404   unsigned i;
1405   data_reference *dr;
1406
1407   FOR_EACH_VEC_ELT (datarefs, i, dr)
1408     {
1409       gimple *stmt = DR_STMT (dr);
1410       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1411       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1412         continue;
1413
1414       /* For interleaving, only the alignment of the first access
1415          matters.  */
1416       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1417           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1418         continue;
1419
1420       /* Strided accesses perform only component accesses, alignment is
1421          irrelevant for them.  */
1422       if (STMT_VINFO_STRIDED_P (stmt_info)
1423           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1424         continue;
1425
1426       int save_misalignment;
1427       save_misalignment = DR_MISALIGNMENT (dr);
1428       if (npeel == 0)
1429         ;
1430       else if (unknown_misalignment && dr == dr0)
1431         SET_DR_MISALIGNMENT (dr, 0);
1432       else
1433         vect_update_misalignment_for_peel (dr, dr0, npeel);
1434       vect_get_data_access_cost (dr, inside_cost, outside_cost,
1435                                  body_cost_vec, prologue_cost_vec);
1436       SET_DR_MISALIGNMENT (dr, save_misalignment);
1437     }
1438 }
1439
1440 /* Traverse peeling hash table and calculate cost for each peeling option.
1441    Find the one with the lowest cost.  */
1442
1443 int
1444 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1445                                    _vect_peel_extended_info *min)
1446 {
1447   vect_peel_info elem = *slot;
1448   int dummy;
1449   unsigned int inside_cost = 0, outside_cost = 0;
1450   gimple *stmt = DR_STMT (elem->dr);
1451   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1452   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1453   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1454                        epilogue_cost_vec;
1455
1456   prologue_cost_vec.create (2);
1457   body_cost_vec.create (2);
1458   epilogue_cost_vec.create (2);
1459
1460   vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1461                                   elem->dr, &inside_cost, &outside_cost,
1462                                   &body_cost_vec, &prologue_cost_vec,
1463                                   elem->npeel, false);
1464
1465   body_cost_vec.release ();
1466
1467   outside_cost += vect_get_known_peeling_cost
1468     (loop_vinfo, elem->npeel, &dummy,
1469      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1470      &prologue_cost_vec, &epilogue_cost_vec);
1471
1472   /* Prologue and epilogue costs are added to the target model later.
1473      These costs depend only on the scalar iteration cost, the
1474      number of peeling iterations finally chosen, and the number of
1475      misaligned statements.  So discard the information found here.  */
1476   prologue_cost_vec.release ();
1477   epilogue_cost_vec.release ();
1478
1479   if (inside_cost < min->inside_cost
1480       || (inside_cost == min->inside_cost
1481           && outside_cost < min->outside_cost))
1482     {
1483       min->inside_cost = inside_cost;
1484       min->outside_cost = outside_cost;
1485       min->peel_info.dr = elem->dr;
1486       min->peel_info.npeel = elem->npeel;
1487       min->peel_info.count = elem->count;
1488     }
1489
1490   return 1;
1491 }
1492
1493
1494 /* Choose best peeling option by traversing peeling hash table and either
1495    choosing an option with the lowest cost (if cost model is enabled) or the
1496    option that aligns as many accesses as possible.  */
1497
1498 static struct _vect_peel_extended_info
1499 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1500                                        loop_vec_info loop_vinfo)
1501 {
1502    struct _vect_peel_extended_info res;
1503
1504    res.peel_info.dr = NULL;
1505
1506    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1507      {
1508        res.inside_cost = INT_MAX;
1509        res.outside_cost = INT_MAX;
1510        peeling_htab->traverse <_vect_peel_extended_info *,
1511                                vect_peeling_hash_get_lowest_cost> (&res);
1512      }
1513    else
1514      {
1515        res.peel_info.count = 0;
1516        peeling_htab->traverse <_vect_peel_extended_info *,
1517                                vect_peeling_hash_get_most_frequent> (&res);
1518        res.inside_cost = 0;
1519        res.outside_cost = 0;
1520      }
1521
1522    return res;
1523 }
1524
1525 /* Return true if the new peeling NPEEL is supported.  */
1526
1527 static bool
1528 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1529                           unsigned npeel)
1530 {
1531   unsigned i;
1532   struct data_reference *dr = NULL;
1533   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1534   gimple *stmt;
1535   stmt_vec_info stmt_info;
1536   enum dr_alignment_support supportable_dr_alignment;
1537
1538   /* Ensure that all data refs can be vectorized after the peel.  */
1539   FOR_EACH_VEC_ELT (datarefs, i, dr)
1540     {
1541       int save_misalignment;
1542
1543       if (dr == dr0)
1544         continue;
1545
1546       stmt = DR_STMT (dr);
1547       stmt_info = vinfo_for_stmt (stmt);
1548       /* For interleaving, only the alignment of the first access
1549          matters.  */
1550       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1551           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1552         continue;
1553
1554       /* Strided accesses perform only component accesses, alignment is
1555          irrelevant for them.  */
1556       if (STMT_VINFO_STRIDED_P (stmt_info)
1557           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1558         continue;
1559
1560       save_misalignment = DR_MISALIGNMENT (dr);
1561       vect_update_misalignment_for_peel (dr, dr0, npeel);
1562       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1563       SET_DR_MISALIGNMENT (dr, save_misalignment);
1564
1565       if (!supportable_dr_alignment)
1566         return false;
1567     }
1568
1569   return true;
1570 }
1571
1572 /* Function vect_enhance_data_refs_alignment
1573
1574    This pass will use loop versioning and loop peeling in order to enhance
1575    the alignment of data references in the loop.
1576
1577    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1578    original loop is to be vectorized.  Any other loops that are created by
1579    the transformations performed in this pass - are not supposed to be
1580    vectorized.  This restriction will be relaxed.
1581
1582    This pass will require a cost model to guide it whether to apply peeling
1583    or versioning or a combination of the two.  For example, the scheme that
1584    intel uses when given a loop with several memory accesses, is as follows:
1585    choose one memory access ('p') which alignment you want to force by doing
1586    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1587    other accesses are not necessarily aligned, or (2) use loop versioning to
1588    generate one loop in which all accesses are aligned, and another loop in
1589    which only 'p' is necessarily aligned.
1590
1591    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1592    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1593    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1594
1595    Devising a cost model is the most critical aspect of this work.  It will
1596    guide us on which access to peel for, whether to use loop versioning, how
1597    many versions to create, etc.  The cost model will probably consist of
1598    generic considerations as well as target specific considerations (on
1599    powerpc for example, misaligned stores are more painful than misaligned
1600    loads).
1601
1602    Here are the general steps involved in alignment enhancements:
1603
1604      -- original loop, before alignment analysis:
1605         for (i=0; i<N; i++){
1606           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1607           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1608         }
1609
1610      -- After vect_compute_data_refs_alignment:
1611         for (i=0; i<N; i++){
1612           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1613           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1614         }
1615
1616      -- Possibility 1: we do loop versioning:
1617      if (p is aligned) {
1618         for (i=0; i<N; i++){    # loop 1A
1619           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1620           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1621         }
1622      }
1623      else {
1624         for (i=0; i<N; i++){    # loop 1B
1625           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1626           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1627         }
1628      }
1629
1630      -- Possibility 2: we do loop peeling:
1631      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1632         x = q[i];
1633         p[i] = y;
1634      }
1635      for (i = 3; i < N; i++){   # loop 2A
1636         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1637         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1638      }
1639
1640      -- Possibility 3: combination of loop peeling and versioning:
1641      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1642         x = q[i];
1643         p[i] = y;
1644      }
1645      if (p is aligned) {
1646         for (i = 3; i<N; i++){  # loop 3A
1647           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1648           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1649         }
1650      }
1651      else {
1652         for (i = 3; i<N; i++){  # loop 3B
1653           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1654           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1655         }
1656      }
1657
1658      These loops are later passed to loop_transform to be vectorized.  The
1659      vectorizer will use the alignment information to guide the transformation
1660      (whether to generate regular loads/stores, or with special handling for
1661      misalignment).  */
1662
1663 bool
1664 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1665 {
1666   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1667   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1668   enum dr_alignment_support supportable_dr_alignment;
1669   struct data_reference *dr0 = NULL, *first_store = NULL;
1670   struct data_reference *dr;
1671   unsigned int i, j;
1672   bool do_peeling = false;
1673   bool do_versioning = false;
1674   bool stat;
1675   gimple *stmt;
1676   stmt_vec_info stmt_info;
1677   unsigned int npeel = 0;
1678   bool one_misalignment_known = false;
1679   bool one_misalignment_unknown = false;
1680   bool one_dr_unsupportable = false;
1681   struct data_reference *unsupportable_dr = NULL;
1682   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1683   unsigned possible_npeel_number = 1;
1684   tree vectype;
1685   unsigned int mis, same_align_drs_max = 0;
1686   hash_table<peel_info_hasher> peeling_htab (1);
1687
1688   if (dump_enabled_p ())
1689     dump_printf_loc (MSG_NOTE, vect_location,
1690                      "=== vect_enhance_data_refs_alignment ===\n");
1691
1692   /* Reset data so we can safely be called multiple times.  */
1693   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1694   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1695
1696   /* While cost model enhancements are expected in the future, the high level
1697      view of the code at this time is as follows:
1698
1699      A) If there is a misaligned access then see if peeling to align
1700         this access can make all data references satisfy
1701         vect_supportable_dr_alignment.  If so, update data structures
1702         as needed and return true.
1703
1704      B) If peeling wasn't possible and there is a data reference with an
1705         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1706         then see if loop versioning checks can be used to make all data
1707         references satisfy vect_supportable_dr_alignment.  If so, update
1708         data structures as needed and return true.
1709
1710      C) If neither peeling nor versioning were successful then return false if
1711         any data reference does not satisfy vect_supportable_dr_alignment.
1712
1713      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1714
1715      Note, Possibility 3 above (which is peeling and versioning together) is not
1716      being done at this time.  */
1717
1718   /* (1) Peeling to force alignment.  */
1719
1720   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1721      Considerations:
1722      + How many accesses will become aligned due to the peeling
1723      - How many accesses will become unaligned due to the peeling,
1724        and the cost of misaligned accesses.
1725      - The cost of peeling (the extra runtime checks, the increase
1726        in code size).  */
1727
1728   FOR_EACH_VEC_ELT (datarefs, i, dr)
1729     {
1730       stmt = DR_STMT (dr);
1731       stmt_info = vinfo_for_stmt (stmt);
1732
1733       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1734         continue;
1735
1736       /* For interleaving, only the alignment of the first access
1737          matters.  */
1738       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1739           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1740         continue;
1741
1742       /* For invariant accesses there is nothing to enhance.  */
1743       if (integer_zerop (DR_STEP (dr)))
1744         continue;
1745
1746       /* Strided accesses perform only component accesses, alignment is
1747          irrelevant for them.  */
1748       if (STMT_VINFO_STRIDED_P (stmt_info)
1749           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1750         continue;
1751
1752       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1753       do_peeling = vector_alignment_reachable_p (dr);
1754       if (do_peeling)
1755         {
1756           if (known_alignment_for_access_p (dr))
1757             {
1758               unsigned int npeel_tmp = 0;
1759               bool negative = tree_int_cst_compare (DR_STEP (dr),
1760                                                     size_zero_node) < 0;
1761
1762               vectype = STMT_VINFO_VECTYPE (stmt_info);
1763               unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1764               unsigned int dr_size = vect_get_scalar_dr_size (dr);
1765               mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1766               if (DR_MISALIGNMENT (dr) != 0)
1767                 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1768
1769               /* For multiple types, it is possible that the bigger type access
1770                  will have more than one peeling option.  E.g., a loop with two
1771                  types: one of size (vector size / 4), and the other one of
1772                  size (vector size / 8).  Vectorization factor will 8.  If both
1773                  accesses are misaligned by 3, the first one needs one scalar
1774                  iteration to be aligned, and the second one needs 5.  But the
1775                  first one will be aligned also by peeling 5 scalar
1776                  iterations, and in that case both accesses will be aligned.
1777                  Hence, except for the immediate peeling amount, we also want
1778                  to try to add full vector size, while we don't exceed
1779                  vectorization factor.
1780                  We do this automatically for cost model, since we calculate
1781                  cost for every peeling option.  */
1782               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1783                 {
1784                   poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1785                                           ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1786                   possible_npeel_number
1787                     = vect_get_num_vectors (nscalars, vectype);
1788
1789                   /* NPEEL_TMP is 0 when there is no misalignment, but also
1790                      allow peeling NELEMENTS.  */
1791                   if (DR_MISALIGNMENT (dr) == 0)
1792                     possible_npeel_number++;
1793                 }
1794
1795               /* Save info about DR in the hash table.  Also include peeling
1796                  amounts according to the explanation above.  */
1797               for (j = 0; j < possible_npeel_number; j++)
1798                 {
1799                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1800                                             dr, npeel_tmp);
1801                   npeel_tmp += target_align / dr_size;
1802                 }
1803
1804               one_misalignment_known = true;
1805             }
1806           else
1807             {
1808               /* If we don't know any misalignment values, we prefer
1809                  peeling for data-ref that has the maximum number of data-refs
1810                  with the same alignment, unless the target prefers to align
1811                  stores over load.  */
1812               unsigned same_align_drs
1813                 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1814               if (!dr0
1815                   || same_align_drs_max < same_align_drs)
1816                 {
1817                   same_align_drs_max = same_align_drs;
1818                   dr0 = dr;
1819                 }
1820               /* For data-refs with the same number of related
1821                  accesses prefer the one where the misalign
1822                  computation will be invariant in the outermost loop.  */
1823               else if (same_align_drs_max == same_align_drs)
1824                 {
1825                   struct loop *ivloop0, *ivloop;
1826                   ivloop0 = outermost_invariant_loop_for_expr
1827                     (loop, DR_BASE_ADDRESS (dr0));
1828                   ivloop = outermost_invariant_loop_for_expr
1829                     (loop, DR_BASE_ADDRESS (dr));
1830                   if ((ivloop && !ivloop0)
1831                       || (ivloop && ivloop0
1832                           && flow_loop_nested_p (ivloop, ivloop0)))
1833                     dr0 = dr;
1834                 }
1835
1836               one_misalignment_unknown = true;
1837
1838               /* Check for data refs with unsupportable alignment that
1839                  can be peeled.  */
1840               if (!supportable_dr_alignment)
1841               {
1842                 one_dr_unsupportable = true;
1843                 unsupportable_dr = dr;
1844               }
1845
1846               if (!first_store && DR_IS_WRITE (dr))
1847                 first_store = dr;
1848             }
1849         }
1850       else
1851         {
1852           if (!aligned_access_p (dr))
1853             {
1854               if (dump_enabled_p ())
1855                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1856                                  "vector alignment may not be reachable\n");
1857               break;
1858             }
1859         }
1860     }
1861
1862   /* Check if we can possibly peel the loop.  */
1863   if (!vect_can_advance_ivs_p (loop_vinfo)
1864       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1865       || loop->inner)
1866     do_peeling = false;
1867
1868   struct _vect_peel_extended_info peel_for_known_alignment;
1869   struct _vect_peel_extended_info peel_for_unknown_alignment;
1870   struct _vect_peel_extended_info best_peel;
1871
1872   peel_for_unknown_alignment.inside_cost = INT_MAX;
1873   peel_for_unknown_alignment.outside_cost = INT_MAX;
1874   peel_for_unknown_alignment.peel_info.count = 0;
1875
1876   if (do_peeling
1877       && one_misalignment_unknown)
1878     {
1879       /* Check if the target requires to prefer stores over loads, i.e., if
1880          misaligned stores are more expensive than misaligned loads (taking
1881          drs with same alignment into account).  */
1882       unsigned int load_inside_cost = 0;
1883       unsigned int load_outside_cost = 0;
1884       unsigned int store_inside_cost = 0;
1885       unsigned int store_outside_cost = 0;
1886       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1887
1888       stmt_vector_for_cost dummy;
1889       dummy.create (2);
1890       vect_get_peeling_costs_all_drs (datarefs, dr0,
1891                                       &load_inside_cost,
1892                                       &load_outside_cost,
1893                                       &dummy, &dummy, estimated_npeels, true);
1894       dummy.release ();
1895
1896       if (first_store)
1897         {
1898           dummy.create (2);
1899           vect_get_peeling_costs_all_drs (datarefs, first_store,
1900                                           &store_inside_cost,
1901                                           &store_outside_cost,
1902                                           &dummy, &dummy,
1903                                           estimated_npeels, true);
1904           dummy.release ();
1905         }
1906       else
1907         {
1908           store_inside_cost = INT_MAX;
1909           store_outside_cost = INT_MAX;
1910         }
1911
1912       if (load_inside_cost > store_inside_cost
1913           || (load_inside_cost == store_inside_cost
1914               && load_outside_cost > store_outside_cost))
1915         {
1916           dr0 = first_store;
1917           peel_for_unknown_alignment.inside_cost = store_inside_cost;
1918           peel_for_unknown_alignment.outside_cost = store_outside_cost;
1919         }
1920       else
1921         {
1922           peel_for_unknown_alignment.inside_cost = load_inside_cost;
1923           peel_for_unknown_alignment.outside_cost = load_outside_cost;
1924         }
1925
1926       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1927       prologue_cost_vec.create (2);
1928       epilogue_cost_vec.create (2);
1929
1930       int dummy2;
1931       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1932         (loop_vinfo, estimated_npeels, &dummy2,
1933          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1934          &prologue_cost_vec, &epilogue_cost_vec);
1935
1936       prologue_cost_vec.release ();
1937       epilogue_cost_vec.release ();
1938
1939       peel_for_unknown_alignment.peel_info.count = 1
1940         + STMT_VINFO_SAME_ALIGN_REFS
1941         (vinfo_for_stmt (DR_STMT (dr0))).length ();
1942     }
1943
1944   peel_for_unknown_alignment.peel_info.npeel = 0;
1945   peel_for_unknown_alignment.peel_info.dr = dr0;
1946
1947   best_peel = peel_for_unknown_alignment;
1948
1949   peel_for_known_alignment.inside_cost = INT_MAX;
1950   peel_for_known_alignment.outside_cost = INT_MAX;
1951   peel_for_known_alignment.peel_info.count = 0;
1952   peel_for_known_alignment.peel_info.dr = NULL;
1953
1954   if (do_peeling && one_misalignment_known)
1955     {
1956       /* Peeling is possible, but there is no data access that is not supported
1957          unless aligned.  So we try to choose the best possible peeling from
1958          the hash table.  */
1959       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1960         (&peeling_htab, loop_vinfo);
1961     }
1962
1963   /* Compare costs of peeling for known and unknown alignment. */
1964   if (peel_for_known_alignment.peel_info.dr != NULL
1965       && peel_for_unknown_alignment.inside_cost
1966       >= peel_for_known_alignment.inside_cost)
1967     {
1968       best_peel = peel_for_known_alignment;
1969
1970       /* If the best peeling for known alignment has NPEEL == 0, perform no
1971          peeling at all except if there is an unsupportable dr that we can
1972          align.  */
1973       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1974         do_peeling = false;
1975     }
1976
1977   /* If there is an unsupportable data ref, prefer this over all choices so far
1978      since we'd have to discard a chosen peeling except when it accidentally
1979      aligned the unsupportable data ref.  */
1980   if (one_dr_unsupportable)
1981     dr0 = unsupportable_dr;
1982   else if (do_peeling)
1983     {
1984       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1985          TODO: Use nopeel_outside_cost or get rid of it?  */
1986       unsigned nopeel_inside_cost = 0;
1987       unsigned nopeel_outside_cost = 0;
1988
1989       stmt_vector_for_cost dummy;
1990       dummy.create (2);
1991       vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1992                                       &nopeel_outside_cost, &dummy, &dummy,
1993                                       0, false);
1994       dummy.release ();
1995
1996       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
1997          costs will be recorded.  */
1998       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1999       prologue_cost_vec.create (2);
2000       epilogue_cost_vec.create (2);
2001
2002       int dummy2;
2003       nopeel_outside_cost += vect_get_known_peeling_cost
2004         (loop_vinfo, 0, &dummy2,
2005          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2006          &prologue_cost_vec, &epilogue_cost_vec);
2007
2008       prologue_cost_vec.release ();
2009       epilogue_cost_vec.release ();
2010
2011       npeel = best_peel.peel_info.npeel;
2012       dr0 = best_peel.peel_info.dr;
2013
2014       /* If no peeling is not more expensive than the best peeling we
2015          have so far, don't perform any peeling.  */
2016       if (nopeel_inside_cost <= best_peel.inside_cost)
2017         do_peeling = false;
2018     }
2019
2020   if (do_peeling)
2021     {
2022       stmt = DR_STMT (dr0);
2023       stmt_info = vinfo_for_stmt (stmt);
2024       vectype = STMT_VINFO_VECTYPE (stmt_info);
2025
2026       if (known_alignment_for_access_p (dr0))
2027         {
2028           bool negative = tree_int_cst_compare (DR_STEP (dr0),
2029                                                 size_zero_node) < 0;
2030           if (!npeel)
2031             {
2032               /* Since it's known at compile time, compute the number of
2033                  iterations in the peeled loop (the peeling factor) for use in
2034                  updating DR_MISALIGNMENT values.  The peeling factor is the
2035                  vectorization factor minus the misalignment as an element
2036                  count.  */
2037               mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2038               unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2039               npeel = ((mis & (target_align - 1))
2040                        / vect_get_scalar_dr_size (dr0));
2041             }
2042
2043           /* For interleaved data access every iteration accesses all the
2044              members of the group, therefore we divide the number of iterations
2045              by the group size.  */
2046           stmt_info = vinfo_for_stmt (DR_STMT (dr0));
2047           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2048             npeel /= DR_GROUP_SIZE (stmt_info);
2049
2050           if (dump_enabled_p ())
2051             dump_printf_loc (MSG_NOTE, vect_location,
2052                              "Try peeling by %d\n", npeel);
2053         }
2054
2055       /* Ensure that all datarefs can be vectorized after the peel.  */
2056       if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2057         do_peeling = false;
2058
2059       /* Check if all datarefs are supportable and log.  */
2060       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2061         {
2062           stat = vect_verify_datarefs_alignment (loop_vinfo);
2063           if (!stat)
2064             do_peeling = false;
2065           else
2066             return stat;
2067         }
2068
2069       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2070       if (do_peeling)
2071         {
2072           unsigned max_allowed_peel
2073             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2074           if (max_allowed_peel != (unsigned)-1)
2075             {
2076               unsigned max_peel = npeel;
2077               if (max_peel == 0)
2078                 {
2079                   unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2080                   max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2081                 }
2082               if (max_peel > max_allowed_peel)
2083                 {
2084                   do_peeling = false;
2085                   if (dump_enabled_p ())
2086                     dump_printf_loc (MSG_NOTE, vect_location,
2087                         "Disable peeling, max peels reached: %d\n", max_peel);
2088                 }
2089             }
2090         }
2091
2092       /* Cost model #2 - if peeling may result in a remaining loop not
2093          iterating enough to be vectorized then do not peel.  Since this
2094          is a cost heuristic rather than a correctness decision, use the
2095          most likely runtime value for variable vectorization factors.  */
2096       if (do_peeling
2097           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2098         {
2099           unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2100           unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2101           if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2102               < assumed_vf + max_peel)
2103             do_peeling = false;
2104         }
2105
2106       if (do_peeling)
2107         {
2108           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2109              If the misalignment of DR_i is identical to that of dr0 then set
2110              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2111              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2112              by the peeling factor times the element size of DR_i (MOD the
2113              vectorization factor times the size).  Otherwise, the
2114              misalignment of DR_i must be set to unknown.  */
2115           FOR_EACH_VEC_ELT (datarefs, i, dr)
2116             if (dr != dr0)
2117               {
2118                 /* Strided accesses perform only component accesses, alignment
2119                    is irrelevant for them.  */
2120                 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2121                 if (STMT_VINFO_STRIDED_P (stmt_info)
2122                     && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2123                   continue;
2124
2125                 vect_update_misalignment_for_peel (dr, dr0, npeel);
2126               }
2127
2128           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2129           if (npeel)
2130             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2131           else
2132             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2133               = DR_MISALIGNMENT (dr0);
2134           SET_DR_MISALIGNMENT (dr0, 0);
2135           if (dump_enabled_p ())
2136             {
2137               dump_printf_loc (MSG_NOTE, vect_location,
2138                                "Alignment of access forced using peeling.\n");
2139               dump_printf_loc (MSG_NOTE, vect_location,
2140                                "Peeling for alignment will be applied.\n");
2141             }
2142
2143           /* The inside-loop cost will be accounted for in vectorizable_load
2144              and vectorizable_store correctly with adjusted alignments.
2145              Drop the body_cst_vec on the floor here.  */
2146           stat = vect_verify_datarefs_alignment (loop_vinfo);
2147           gcc_assert (stat);
2148           return stat;
2149         }
2150     }
2151
2152   /* (2) Versioning to force alignment.  */
2153
2154   /* Try versioning if:
2155      1) optimize loop for speed
2156      2) there is at least one unsupported misaligned data ref with an unknown
2157         misalignment, and
2158      3) all misaligned data refs with a known misalignment are supported, and
2159      4) the number of runtime alignment checks is within reason.  */
2160
2161   do_versioning =
2162         optimize_loop_nest_for_speed_p (loop)
2163         && (!loop->inner); /* FORNOW */
2164
2165   if (do_versioning)
2166     {
2167       FOR_EACH_VEC_ELT (datarefs, i, dr)
2168         {
2169           stmt = DR_STMT (dr);
2170           stmt_info = vinfo_for_stmt (stmt);
2171
2172           /* For interleaving, only the alignment of the first access
2173              matters.  */
2174           if (aligned_access_p (dr)
2175               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2176                   && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2177             continue;
2178
2179           if (STMT_VINFO_STRIDED_P (stmt_info))
2180             {
2181               /* Strided loads perform only component accesses, alignment is
2182                  irrelevant for them.  */
2183               if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2184                 continue;
2185               do_versioning = false;
2186               break;
2187             }
2188
2189           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2190
2191           if (!supportable_dr_alignment)
2192             {
2193               gimple *stmt;
2194               int mask;
2195               tree vectype;
2196
2197               if (known_alignment_for_access_p (dr)
2198                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2199                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2200                 {
2201                   do_versioning = false;
2202                   break;
2203                 }
2204
2205               stmt = DR_STMT (dr);
2206               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2207               gcc_assert (vectype);
2208
2209               /* At present we don't support versioning for alignment
2210                  with variable VF, since there's no guarantee that the
2211                  VF is a power of two.  We could relax this if we added
2212                  a way of enforcing a power-of-two size.  */
2213               unsigned HOST_WIDE_INT size;
2214               if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2215                 {
2216                   do_versioning = false;
2217                   break;
2218                 }
2219
2220               /* The rightmost bits of an aligned address must be zeros.
2221                  Construct the mask needed for this test.  For example,
2222                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2223                  mask must be 15 = 0xf. */
2224               mask = size - 1;
2225
2226               /* FORNOW: use the same mask to test all potentially unaligned
2227                  references in the loop.  The vectorizer currently supports
2228                  a single vector size, see the reference to
2229                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2230                  vectorization factor is computed.  */
2231               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2232                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2233               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2234               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2235                       DR_STMT (dr));
2236             }
2237         }
2238
2239       /* Versioning requires at least one misaligned data reference.  */
2240       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2241         do_versioning = false;
2242       else if (!do_versioning)
2243         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2244     }
2245
2246   if (do_versioning)
2247     {
2248       vec<gimple *> may_misalign_stmts
2249         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2250       gimple *stmt;
2251
2252       /* It can now be assumed that the data references in the statements
2253          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2254          of the loop being vectorized.  */
2255       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2256         {
2257           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2258           dr = STMT_VINFO_DATA_REF (stmt_info);
2259           SET_DR_MISALIGNMENT (dr, 0);
2260           if (dump_enabled_p ())
2261             dump_printf_loc (MSG_NOTE, vect_location,
2262                              "Alignment of access forced using versioning.\n");
2263         }
2264
2265       if (dump_enabled_p ())
2266         dump_printf_loc (MSG_NOTE, vect_location,
2267                          "Versioning for alignment will be applied.\n");
2268
2269       /* Peeling and versioning can't be done together at this time.  */
2270       gcc_assert (! (do_peeling && do_versioning));
2271
2272       stat = vect_verify_datarefs_alignment (loop_vinfo);
2273       gcc_assert (stat);
2274       return stat;
2275     }
2276
2277   /* This point is reached if neither peeling nor versioning is being done.  */
2278   gcc_assert (! (do_peeling || do_versioning));
2279
2280   stat = vect_verify_datarefs_alignment (loop_vinfo);
2281   return stat;
2282 }
2283
2284
2285 /* Function vect_find_same_alignment_drs.
2286
2287    Update group and alignment relations according to the chosen
2288    vectorization factor.  */
2289
2290 static void
2291 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2292 {
2293   struct data_reference *dra = DDR_A (ddr);
2294   struct data_reference *drb = DDR_B (ddr);
2295   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2296   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2297
2298   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2299     return;
2300
2301   if (dra == drb)
2302     return;
2303
2304   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2305       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2306       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2307     return;
2308
2309   /* Two references with distance zero have the same alignment.  */
2310   poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2311                           - wi::to_poly_offset (DR_INIT (drb)));
2312   if (maybe_ne (diff, 0))
2313     {
2314       /* Get the wider of the two alignments.  */
2315       unsigned int align_a = (vect_calculate_target_alignment (dra)
2316                               / BITS_PER_UNIT);
2317       unsigned int align_b = (vect_calculate_target_alignment (drb)
2318                               / BITS_PER_UNIT);
2319       unsigned int max_align = MAX (align_a, align_b);
2320
2321       /* Require the gap to be a multiple of the larger vector alignment.  */
2322       if (!multiple_p (diff, max_align))
2323         return;
2324     }
2325
2326   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2327   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2328   if (dump_enabled_p ())
2329     {
2330       dump_printf_loc (MSG_NOTE, vect_location,
2331                        "accesses have the same alignment: ");
2332       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2333       dump_printf (MSG_NOTE,  " and ");
2334       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2335       dump_printf (MSG_NOTE, "\n");
2336     }
2337 }
2338
2339
2340 /* Function vect_analyze_data_refs_alignment
2341
2342    Analyze the alignment of the data-references in the loop.
2343    Return FALSE if a data reference is found that cannot be vectorized.  */
2344
2345 bool
2346 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2347 {
2348   if (dump_enabled_p ())
2349     dump_printf_loc (MSG_NOTE, vect_location,
2350                      "=== vect_analyze_data_refs_alignment ===\n");
2351
2352   /* Mark groups of data references with same alignment using
2353      data dependence information.  */
2354   vec<ddr_p> ddrs = vinfo->ddrs;
2355   struct data_dependence_relation *ddr;
2356   unsigned int i;
2357
2358   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2359     vect_find_same_alignment_drs (ddr);
2360
2361   vec<data_reference_p> datarefs = vinfo->datarefs;
2362   struct data_reference *dr;
2363
2364   vect_record_base_alignments (vinfo);
2365   FOR_EACH_VEC_ELT (datarefs, i, dr)
2366     {
2367       stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2368       if (STMT_VINFO_VECTORIZABLE (stmt_info)
2369           && !vect_compute_data_ref_alignment (dr))
2370         {
2371           /* Strided accesses perform only component accesses, misalignment
2372              information is irrelevant for them.  */
2373           if (STMT_VINFO_STRIDED_P (stmt_info)
2374               && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2375             continue;
2376
2377           if (dump_enabled_p ())
2378             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2379                              "not vectorized: can't calculate alignment "
2380                              "for data ref.\n");
2381
2382           return false;
2383         }
2384     }
2385
2386   return true;
2387 }
2388
2389
2390 /* Analyze alignment of DRs of stmts in NODE.  */
2391
2392 static bool
2393 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2394 {
2395   /* We vectorize from the first scalar stmt in the node unless
2396      the node is permuted in which case we start from the first
2397      element in the group.  */
2398   gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2399   data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2400   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2401     first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2402
2403   data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2404   if (! vect_compute_data_ref_alignment (dr)
2405       /* For creating the data-ref pointer we need alignment of the
2406          first element anyway.  */
2407       || (dr != first_dr
2408           && ! vect_compute_data_ref_alignment (first_dr))
2409       || ! verify_data_ref_alignment (dr))
2410     {
2411       if (dump_enabled_p ())
2412         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2413                          "not vectorized: bad data alignment in basic "
2414                          "block.\n");
2415       return false;
2416     }
2417
2418   return true;
2419 }
2420
2421 /* Function vect_slp_analyze_instance_alignment
2422
2423    Analyze the alignment of the data-references in the SLP instance.
2424    Return FALSE if a data reference is found that cannot be vectorized.  */
2425
2426 bool
2427 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2428 {
2429   if (dump_enabled_p ())
2430     dump_printf_loc (MSG_NOTE, vect_location,
2431                      "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2432
2433   slp_tree node;
2434   unsigned i;
2435   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2436     if (! vect_slp_analyze_and_verify_node_alignment (node))
2437       return false;
2438
2439   node = SLP_INSTANCE_TREE (instance);
2440   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2441       && ! vect_slp_analyze_and_verify_node_alignment
2442              (SLP_INSTANCE_TREE (instance)))
2443     return false;
2444
2445   return true;
2446 }
2447
2448
2449 /* Analyze groups of accesses: check that DR belongs to a group of
2450    accesses of legal size, step, etc.  Detect gaps, single element
2451    interleaving, and other special cases. Set grouped access info.
2452    Collect groups of strided stores for further use in SLP analysis.
2453    Worker for vect_analyze_group_access.  */
2454
2455 static bool
2456 vect_analyze_group_access_1 (struct data_reference *dr)
2457 {
2458   tree step = DR_STEP (dr);
2459   tree scalar_type = TREE_TYPE (DR_REF (dr));
2460   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2461   gimple *stmt = DR_STMT (dr);
2462   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2463   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2464   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2465   HOST_WIDE_INT dr_step = -1;
2466   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2467   bool slp_impossible = false;
2468
2469   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2470      size of the interleaving group (including gaps).  */
2471   if (tree_fits_shwi_p (step))
2472     {
2473       dr_step = tree_to_shwi (step);
2474       /* Check that STEP is a multiple of type size.  Otherwise there is
2475          a non-element-sized gap at the end of the group which we
2476          cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2477          ???  As we can handle non-constant step fine here we should
2478          simply remove uses of DR_GROUP_GAP between the last and first
2479          element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
2480          simply not include that gap.  */
2481       if ((dr_step % type_size) != 0)
2482         {
2483           if (dump_enabled_p ())
2484             {
2485               dump_printf_loc (MSG_NOTE, vect_location,
2486                                "Step ");
2487               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2488               dump_printf (MSG_NOTE,
2489                            " is not a multiple of the element size for ");
2490               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2491               dump_printf (MSG_NOTE, "\n");
2492             }
2493           return false;
2494         }
2495       groupsize = absu_hwi (dr_step) / type_size;
2496     }
2497   else
2498     groupsize = 0;
2499
2500   /* Not consecutive access is possible only if it is a part of interleaving.  */
2501   if (!DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2502     {
2503       /* Check if it this DR is a part of interleaving, and is a single
2504          element of the group that is accessed in the loop.  */
2505
2506       /* Gaps are supported only for loads. STEP must be a multiple of the type
2507          size.  */
2508       if (DR_IS_READ (dr)
2509           && (dr_step % type_size) == 0
2510           && groupsize > 0)
2511         {
2512           DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2513           DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2514           DR_GROUP_GAP (stmt_info) = groupsize - 1;
2515           if (dump_enabled_p ())
2516             {
2517               dump_printf_loc (MSG_NOTE, vect_location,
2518                                "Detected single element interleaving ");
2519               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2520               dump_printf (MSG_NOTE, " step ");
2521               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2522               dump_printf (MSG_NOTE, "\n");
2523             }
2524
2525           return true;
2526         }
2527
2528       if (dump_enabled_p ())
2529         {
2530           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2531                            "not consecutive access ");
2532           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2533         }
2534
2535       if (bb_vinfo)
2536         {
2537           /* Mark the statement as unvectorizable.  */
2538           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2539           return true;
2540         }
2541
2542       dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2543       STMT_VINFO_STRIDED_P (stmt_info) = true;
2544       return true;
2545     }
2546
2547   if (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2548     {
2549       /* First stmt in the interleaving chain. Check the chain.  */
2550       gimple *next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2551       struct data_reference *data_ref = dr;
2552       unsigned int count = 1;
2553       tree prev_init = DR_INIT (data_ref);
2554       gimple *prev = stmt;
2555       HOST_WIDE_INT diff, gaps = 0;
2556
2557       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2558       while (next)
2559         {
2560           /* Skip same data-refs.  In case that two or more stmts share
2561              data-ref (supported only for loads), we vectorize only the first
2562              stmt, and the rest get their vectorized loads from the first
2563              one.  */
2564           if (!tree_int_cst_compare (DR_INIT (data_ref),
2565                                      DR_INIT (STMT_VINFO_DATA_REF (
2566                                                    vinfo_for_stmt (next)))))
2567             {
2568               if (DR_IS_WRITE (data_ref))
2569                 {
2570                   if (dump_enabled_p ())
2571                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2572                                      "Two store stmts share the same dr.\n");
2573                   return false;
2574                 }
2575
2576               if (dump_enabled_p ())
2577                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2578                                  "Two or more load stmts share the same dr.\n");
2579
2580               /* For load use the same data-ref load.  */
2581               DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2582
2583               prev = next;
2584               next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2585               continue;
2586             }
2587
2588           prev = next;
2589           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2590
2591           /* All group members have the same STEP by construction.  */
2592           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2593
2594           /* Check that the distance between two accesses is equal to the type
2595              size. Otherwise, we have gaps.  */
2596           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2597                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2598           if (diff != 1)
2599             {
2600               /* FORNOW: SLP of accesses with gaps is not supported.  */
2601               slp_impossible = true;
2602               if (DR_IS_WRITE (data_ref))
2603                 {
2604                   if (dump_enabled_p ())
2605                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2606                                      "interleaved store with gaps\n");
2607                   return false;
2608                 }
2609
2610               gaps += diff - 1;
2611             }
2612
2613           last_accessed_element += diff;
2614
2615           /* Store the gap from the previous member of the group. If there is no
2616              gap in the access, DR_GROUP_GAP is always 1.  */
2617           DR_GROUP_GAP (vinfo_for_stmt (next)) = diff;
2618
2619           prev_init = DR_INIT (data_ref);
2620           next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2621           /* Count the number of data-refs in the chain.  */
2622           count++;
2623         }
2624
2625       if (groupsize == 0)
2626         groupsize = count + gaps;
2627
2628       /* This could be UINT_MAX but as we are generating code in a very
2629          inefficient way we have to cap earlier.  See PR78699 for example.  */
2630       if (groupsize > 4096)
2631         {
2632           if (dump_enabled_p ())
2633             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2634                              "group is too large\n");
2635           return false;
2636         }
2637
2638       /* Check that the size of the interleaving is equal to count for stores,
2639          i.e., that there are no gaps.  */
2640       if (groupsize != count
2641           && !DR_IS_READ (dr))
2642         {
2643           if (dump_enabled_p ())
2644             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2645                              "interleaved store with gaps\n");
2646           return false;
2647         }
2648
2649       /* If there is a gap after the last load in the group it is the
2650          difference between the groupsize and the last accessed
2651          element.
2652          When there is no gap, this difference should be 0.  */
2653       DR_GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2654
2655       DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2656       if (dump_enabled_p ())
2657         {
2658           dump_printf_loc (MSG_NOTE, vect_location,
2659                            "Detected interleaving ");
2660           if (DR_IS_READ (dr))
2661             dump_printf (MSG_NOTE, "load ");
2662           else
2663             dump_printf (MSG_NOTE, "store ");
2664           dump_printf (MSG_NOTE, "of size %u starting with ",
2665                        (unsigned)groupsize);
2666           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2667           if (DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2668             dump_printf_loc (MSG_NOTE, vect_location,
2669                              "There is a gap of %u elements after the group\n",
2670                              DR_GROUP_GAP (vinfo_for_stmt (stmt)));
2671         }
2672
2673       /* SLP: create an SLP data structure for every interleaving group of
2674          stores for further analysis in vect_analyse_slp.  */
2675       if (DR_IS_WRITE (dr) && !slp_impossible)
2676         {
2677           if (loop_vinfo)
2678             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2679           if (bb_vinfo)
2680             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2681         }
2682     }
2683
2684   return true;
2685 }
2686
2687 /* Analyze groups of accesses: check that DR belongs to a group of
2688    accesses of legal size, step, etc.  Detect gaps, single element
2689    interleaving, and other special cases. Set grouped access info.
2690    Collect groups of strided stores for further use in SLP analysis.  */
2691
2692 static bool
2693 vect_analyze_group_access (struct data_reference *dr)
2694 {
2695   if (!vect_analyze_group_access_1 (dr))
2696     {
2697       /* Dissolve the group if present.  */
2698       gimple *next;
2699       gimple *stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2700       while (stmt)
2701         {
2702           stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2703           next = DR_GROUP_NEXT_ELEMENT (vinfo);
2704           DR_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2705           DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2706           stmt = next;
2707         }
2708       return false;
2709     }
2710   return true;
2711 }
2712
2713 /* Analyze the access pattern of the data-reference DR.
2714    In case of non-consecutive accesses call vect_analyze_group_access() to
2715    analyze groups of accesses.  */
2716
2717 static bool
2718 vect_analyze_data_ref_access (struct data_reference *dr)
2719 {
2720   tree step = DR_STEP (dr);
2721   tree scalar_type = TREE_TYPE (DR_REF (dr));
2722   gimple *stmt = DR_STMT (dr);
2723   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2724   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2725   struct loop *loop = NULL;
2726
2727   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2728     return true;
2729
2730   if (loop_vinfo)
2731     loop = LOOP_VINFO_LOOP (loop_vinfo);
2732
2733   if (loop_vinfo && !step)
2734     {
2735       if (dump_enabled_p ())
2736         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2737                          "bad data-ref access in loop\n");
2738       return false;
2739     }
2740
2741   /* Allow loads with zero step in inner-loop vectorization.  */
2742   if (loop_vinfo && integer_zerop (step))
2743     {
2744       DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2745       if (!nested_in_vect_loop_p (loop, stmt))
2746         return DR_IS_READ (dr);
2747       /* Allow references with zero step for outer loops marked
2748          with pragma omp simd only - it guarantees absence of
2749          loop-carried dependencies between inner loop iterations.  */
2750       if (loop->safelen < 2)
2751         {
2752           if (dump_enabled_p ())
2753             dump_printf_loc (MSG_NOTE, vect_location,
2754                              "zero step in inner loop of nest\n");
2755           return false;
2756         }
2757     }
2758
2759   if (loop && nested_in_vect_loop_p (loop, stmt))
2760     {
2761       /* Interleaved accesses are not yet supported within outer-loop
2762         vectorization for references in the inner-loop.  */
2763       DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2764
2765       /* For the rest of the analysis we use the outer-loop step.  */
2766       step = STMT_VINFO_DR_STEP (stmt_info);
2767       if (integer_zerop (step))
2768         {
2769           if (dump_enabled_p ())
2770             dump_printf_loc (MSG_NOTE, vect_location,
2771                              "zero step in outer loop.\n");
2772           return DR_IS_READ (dr);
2773         }
2774     }
2775
2776   /* Consecutive?  */
2777   if (TREE_CODE (step) == INTEGER_CST)
2778     {
2779       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2780       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2781           || (dr_step < 0
2782               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2783         {
2784           /* Mark that it is not interleaving.  */
2785           DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2786           return true;
2787         }
2788     }
2789
2790   if (loop && nested_in_vect_loop_p (loop, stmt))
2791     {
2792       if (dump_enabled_p ())
2793         dump_printf_loc (MSG_NOTE, vect_location,
2794                          "grouped access in outer loop.\n");
2795       return false;
2796     }
2797
2798
2799   /* Assume this is a DR handled by non-constant strided load case.  */
2800   if (TREE_CODE (step) != INTEGER_CST)
2801     return (STMT_VINFO_STRIDED_P (stmt_info)
2802             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2803                 || vect_analyze_group_access (dr)));
2804
2805   /* Not consecutive access - check if it's a part of interleaving group.  */
2806   return vect_analyze_group_access (dr);
2807 }
2808
2809 /* Compare two data-references DRA and DRB to group them into chunks
2810    suitable for grouping.  */
2811
2812 static int
2813 dr_group_sort_cmp (const void *dra_, const void *drb_)
2814 {
2815   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2816   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2817   int cmp;
2818
2819   /* Stabilize sort.  */
2820   if (dra == drb)
2821     return 0;
2822
2823   /* DRs in different loops never belong to the same group.  */
2824   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2825   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2826   if (loopa != loopb)
2827     return loopa->num < loopb->num ? -1 : 1;
2828
2829   /* Ordering of DRs according to base.  */
2830   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2831                                DR_BASE_ADDRESS (drb));
2832   if (cmp != 0)
2833     return cmp;
2834
2835   /* And according to DR_OFFSET.  */
2836   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2837   if (cmp != 0)
2838     return cmp;
2839
2840   /* Put reads before writes.  */
2841   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2842     return DR_IS_READ (dra) ? -1 : 1;
2843
2844   /* Then sort after access size.  */
2845   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2846                                TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2847   if (cmp != 0)
2848     return cmp;
2849
2850   /* And after step.  */
2851   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2852   if (cmp != 0)
2853     return cmp;
2854
2855   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2856   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2857   if (cmp == 0)
2858     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2859   return cmp;
2860 }
2861
2862 /* If OP is the result of a conversion, return the unconverted value,
2863    otherwise return null.  */
2864
2865 static tree
2866 strip_conversion (tree op)
2867 {
2868   if (TREE_CODE (op) != SSA_NAME)
2869     return NULL_TREE;
2870   gimple *stmt = SSA_NAME_DEF_STMT (op);
2871   if (!is_gimple_assign (stmt)
2872       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2873     return NULL_TREE;
2874   return gimple_assign_rhs1 (stmt);
2875 }
2876
2877 /* Return true if vectorizable_* routines can handle statements STMT1
2878    and STMT2 being in a single group.  */
2879
2880 static bool
2881 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2882 {
2883   if (gimple_assign_single_p (stmt1))
2884     return gimple_assign_single_p (stmt2);
2885
2886   if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2887     {
2888       /* Check for two masked loads or two masked stores.  */
2889       if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2890         return false;
2891       internal_fn ifn = gimple_call_internal_fn (stmt1);
2892       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2893         return false;
2894       if (ifn != gimple_call_internal_fn (stmt2))
2895         return false;
2896
2897       /* Check that the masks are the same.  Cope with casts of masks,
2898          like those created by build_mask_conversion.  */
2899       tree mask1 = gimple_call_arg (stmt1, 2);
2900       tree mask2 = gimple_call_arg (stmt2, 2);
2901       if (!operand_equal_p (mask1, mask2, 0))
2902         {
2903           mask1 = strip_conversion (mask1);
2904           if (!mask1)
2905             return false;
2906           mask2 = strip_conversion (mask2);
2907           if (!mask2)
2908             return false;
2909           if (!operand_equal_p (mask1, mask2, 0))
2910             return false;
2911         }
2912       return true;
2913     }
2914
2915   return false;
2916 }
2917
2918 /* Function vect_analyze_data_ref_accesses.
2919
2920    Analyze the access pattern of all the data references in the loop.
2921
2922    FORNOW: the only access pattern that is considered vectorizable is a
2923            simple step 1 (consecutive) access.
2924
2925    FORNOW: handle only arrays and pointer accesses.  */
2926
2927 bool
2928 vect_analyze_data_ref_accesses (vec_info *vinfo)
2929 {
2930   unsigned int i;
2931   vec<data_reference_p> datarefs = vinfo->datarefs;
2932   struct data_reference *dr;
2933
2934   if (dump_enabled_p ())
2935     dump_printf_loc (MSG_NOTE, vect_location,
2936                      "=== vect_analyze_data_ref_accesses ===\n");
2937
2938   if (datarefs.is_empty ())
2939     return true;
2940
2941   /* Sort the array of datarefs to make building the interleaving chains
2942      linear.  Don't modify the original vector's order, it is needed for
2943      determining what dependencies are reversed.  */
2944   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2945   datarefs_copy.qsort (dr_group_sort_cmp);
2946
2947   /* Build the interleaving chains.  */
2948   for (i = 0; i < datarefs_copy.length () - 1;)
2949     {
2950       data_reference_p dra = datarefs_copy[i];
2951       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2952       stmt_vec_info lastinfo = NULL;
2953       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2954           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2955         {
2956           ++i;
2957           continue;
2958         }
2959       for (i = i + 1; i < datarefs_copy.length (); ++i)
2960         {
2961           data_reference_p drb = datarefs_copy[i];
2962           stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2963           if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2964               || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2965             break;
2966
2967           /* ???  Imperfect sorting (non-compatible types, non-modulo
2968              accesses, same accesses) can lead to a group to be artificially
2969              split here as we don't just skip over those.  If it really
2970              matters we can push those to a worklist and re-iterate
2971              over them.  The we can just skip ahead to the next DR here.  */
2972
2973           /* DRs in a different loop should not be put into the same
2974              interleaving group.  */
2975           if (gimple_bb (DR_STMT (dra))->loop_father
2976               != gimple_bb (DR_STMT (drb))->loop_father)
2977             break;
2978
2979           /* Check that the data-refs have same first location (except init)
2980              and they are both either store or load (not load and store,
2981              not masked loads or stores).  */
2982           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2983               || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2984                                         DR_BASE_ADDRESS (drb)) != 0
2985               || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2986               || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
2987             break;
2988
2989           /* Check that the data-refs have the same constant size.  */
2990           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2991           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2992           if (!tree_fits_uhwi_p (sza)
2993               || !tree_fits_uhwi_p (szb)
2994               || !tree_int_cst_equal (sza, szb))
2995             break;
2996
2997           /* Check that the data-refs have the same step.  */
2998           if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2999             break;
3000
3001           /* Check the types are compatible.
3002              ???  We don't distinguish this during sorting.  */
3003           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3004                                    TREE_TYPE (DR_REF (drb))))
3005             break;
3006
3007           /* Check that the DR_INITs are compile-time constants.  */
3008           if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3009               || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3010             break;
3011
3012           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
3013           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3014           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3015           HOST_WIDE_INT init_prev
3016             = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3017           gcc_assert (init_a <= init_b
3018                       && init_a <= init_prev
3019                       && init_prev <= init_b);
3020
3021           /* Do not place the same access in the interleaving chain twice.  */
3022           if (init_b == init_prev)
3023             {
3024               gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3025                           < gimple_uid (DR_STMT (drb)));
3026               /* ???  For now we simply "drop" the later reference which is
3027                  otherwise the same rather than finishing off this group.
3028                  In the end we'd want to re-process duplicates forming
3029                  multiple groups from the refs, likely by just collecting
3030                  all candidates (including duplicates and split points
3031                  below) in a vector and then process them together.  */
3032               continue;
3033             }
3034
3035           /* If init_b == init_a + the size of the type * k, we have an
3036              interleaving, and DRA is accessed before DRB.  */
3037           HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3038           if (type_size_a == 0
3039               || (init_b - init_a) % type_size_a != 0)
3040             break;
3041
3042           /* If we have a store, the accesses are adjacent.  This splits
3043              groups into chunks we support (we don't support vectorization
3044              of stores with gaps).  */
3045           if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3046             break;
3047
3048           /* If the step (if not zero or non-constant) is greater than the
3049              difference between data-refs' inits this splits groups into
3050              suitable sizes.  */
3051           if (tree_fits_shwi_p (DR_STEP (dra)))
3052             {
3053               HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3054               if (step != 0 && step <= (init_b - init_a))
3055                 break;
3056             }
3057
3058           if (dump_enabled_p ())
3059             {
3060               dump_printf_loc (MSG_NOTE, vect_location,
3061                                "Detected interleaving ");
3062               if (DR_IS_READ (dra))
3063                 dump_printf (MSG_NOTE, "load ");
3064               else
3065                 dump_printf (MSG_NOTE, "store ");
3066               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3067               dump_printf (MSG_NOTE,  " and ");
3068               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3069               dump_printf (MSG_NOTE, "\n");
3070             }
3071
3072           /* Link the found element into the group list.  */
3073           if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3074             {
3075               DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
3076               lastinfo = stmtinfo_a;
3077             }
3078           DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
3079           DR_GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
3080           lastinfo = stmtinfo_b;
3081         }
3082     }
3083
3084   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3085     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
3086         && !vect_analyze_data_ref_access (dr))
3087       {
3088         if (dump_enabled_p ())
3089           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3090                            "not vectorized: complicated access pattern.\n");
3091
3092         if (is_a <bb_vec_info> (vinfo))
3093           {
3094             /* Mark the statement as not vectorizable.  */
3095             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3096             continue;
3097           }
3098         else
3099           {
3100             datarefs_copy.release ();
3101             return false;
3102           }
3103       }
3104
3105   datarefs_copy.release ();
3106   return true;
3107 }
3108
3109 /* Function vect_vfa_segment_size.
3110
3111    Input:
3112      DR: The data reference.
3113      LENGTH_FACTOR: segment length to consider.
3114
3115    Return a value suitable for the dr_with_seg_len::seg_len field.
3116    This is the "distance travelled" by the pointer from the first
3117    iteration in the segment to the last.  Note that it does not include
3118    the size of the access; in effect it only describes the first byte.  */
3119
3120 static tree
3121 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3122 {
3123   length_factor = size_binop (MINUS_EXPR,
3124                               fold_convert (sizetype, length_factor),
3125                               size_one_node);
3126   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3127                      length_factor);
3128 }
3129
3130 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3131    gives the worst-case number of bytes covered by the segment.  */
3132
3133 static unsigned HOST_WIDE_INT
3134 vect_vfa_access_size (data_reference *dr)
3135 {
3136   stmt_vec_info stmt_vinfo = vinfo_for_stmt (DR_STMT (dr));
3137   tree ref_type = TREE_TYPE (DR_REF (dr));
3138   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3139   unsigned HOST_WIDE_INT access_size = ref_size;
3140   if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3141     {
3142       gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == DR_STMT (dr));
3143       access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3144     }
3145   if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3146       && (vect_supportable_dr_alignment (dr, false)
3147           == dr_explicit_realign_optimized))
3148     {
3149       /* We might access a full vector's worth.  */
3150       tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3151       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3152     }
3153   return access_size;
3154 }
3155
3156 /* Get the minimum alignment for all the scalar accesses that DR describes.  */
3157
3158 static unsigned int
3159 vect_vfa_align (const data_reference *dr)
3160 {
3161   return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3162 }
3163
3164 /* Function vect_no_alias_p.
3165
3166    Given data references A and B with equal base and offset, see whether
3167    the alias relation can be decided at compilation time.  Return 1 if
3168    it can and the references alias, 0 if it can and the references do
3169    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3170    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3171    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
3172
3173 static int
3174 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3175                          tree segment_length_a, tree segment_length_b,
3176                          unsigned HOST_WIDE_INT access_size_a,
3177                          unsigned HOST_WIDE_INT access_size_b)
3178 {
3179   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3180   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3181   poly_uint64 const_length_a;
3182   poly_uint64 const_length_b;
3183
3184   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3185      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3186      [a, a+12) */
3187   if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3188     {
3189       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3190       offset_a = (offset_a + access_size_a) - const_length_a;
3191     }
3192   else
3193     const_length_a = tree_to_poly_uint64 (segment_length_a);
3194   if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3195     {
3196       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3197       offset_b = (offset_b + access_size_b) - const_length_b;
3198     }
3199   else
3200     const_length_b = tree_to_poly_uint64 (segment_length_b);
3201
3202   const_length_a += access_size_a;
3203   const_length_b += access_size_b;
3204
3205   if (ranges_known_overlap_p (offset_a, const_length_a,
3206                               offset_b, const_length_b))
3207     return 1;
3208
3209   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3210                                offset_b, const_length_b))
3211     return 0;
3212
3213   return -1;
3214 }
3215
3216 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3217    in DDR is >= VF.  */
3218
3219 static bool
3220 dependence_distance_ge_vf (data_dependence_relation *ddr,
3221                            unsigned int loop_depth, poly_uint64 vf)
3222 {
3223   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3224       || DDR_NUM_DIST_VECTS (ddr) == 0)
3225     return false;
3226
3227   /* If the dependence is exact, we should have limited the VF instead.  */
3228   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3229
3230   unsigned int i;
3231   lambda_vector dist_v;
3232   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3233     {
3234       HOST_WIDE_INT dist = dist_v[loop_depth];
3235       if (dist != 0
3236           && !(dist > 0 && DDR_REVERSED_P (ddr))
3237           && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3238         return false;
3239     }
3240
3241   if (dump_enabled_p ())
3242     {
3243       dump_printf_loc (MSG_NOTE, vect_location,
3244                        "dependence distance between ");
3245       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3246       dump_printf (MSG_NOTE,  " and ");
3247       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3248       dump_printf (MSG_NOTE,  " is >= VF\n");
3249     }
3250
3251   return true;
3252 }
3253
3254 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3255
3256 static void
3257 dump_lower_bound (int dump_kind, const vec_lower_bound &lower_bound)
3258 {
3259   dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3260   dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3261   dump_printf (dump_kind, ") >= ");
3262   dump_dec (dump_kind, lower_bound.min_value);
3263 }
3264
3265 /* Record that the vectorized loop requires the vec_lower_bound described
3266    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3267
3268 static void
3269 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3270                         poly_uint64 min_value)
3271 {
3272   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3273   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3274     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3275       {
3276         unsigned_p &= lower_bounds[i].unsigned_p;
3277         min_value = upper_bound (lower_bounds[i].min_value, min_value);
3278         if (lower_bounds[i].unsigned_p != unsigned_p
3279             || maybe_lt (lower_bounds[i].min_value, min_value))
3280           {
3281             lower_bounds[i].unsigned_p = unsigned_p;
3282             lower_bounds[i].min_value = min_value;
3283             if (dump_enabled_p ())
3284               {
3285                 dump_printf_loc (MSG_NOTE, vect_location,
3286                                  "updating run-time check to ");
3287                 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3288                 dump_printf (MSG_NOTE, "\n");
3289               }
3290           }
3291         return;
3292       }
3293
3294   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3295   if (dump_enabled_p ())
3296     {
3297       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3298       dump_lower_bound (MSG_NOTE, lower_bound);
3299       dump_printf (MSG_NOTE, "\n");
3300     }
3301   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3302 }
3303
3304 /* Return true if it's unlikely that the step of the vectorized form of DR
3305    will span fewer than GAP bytes.  */
3306
3307 static bool
3308 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3309 {
3310   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
3311   HOST_WIDE_INT count
3312     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3313   if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3314     count *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
3315   return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3316 }
3317
3318 /* Return true if we know that there is no alias between DR_A and DR_B
3319    when abs (DR_STEP (DR_A)) >= N for some N.  When returning true, set
3320    *LOWER_BOUND_OUT to this N.  */
3321
3322 static bool
3323 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3324                                 poly_uint64 *lower_bound_out)
3325 {
3326   /* Check that there is a constant gap of known sign between DR_A
3327      and DR_B.  */
3328   poly_int64 init_a, init_b;
3329   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3330       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3331       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3332       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3333       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3334       || !ordered_p (init_a, init_b))
3335     return false;
3336
3337   /* Sort DR_A and DR_B by the address they access.  */
3338   if (maybe_lt (init_b, init_a))
3339     {
3340       std::swap (init_a, init_b);
3341       std::swap (dr_a, dr_b);
3342     }
3343
3344   /* If the two accesses could be dependent within a scalar iteration,
3345      make sure that we'd retain their order.  */
3346   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3347       && !vect_preserves_scalar_order_p (DR_STMT (dr_a), DR_STMT (dr_b)))
3348     return false;
3349
3350   /* There is no alias if abs (DR_STEP) is greater than or equal to
3351      the bytes spanned by the combination of the two accesses.  */
3352   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3353   return true;
3354 }
3355
3356 /* Function vect_prune_runtime_alias_test_list.
3357
3358    Prune a list of ddrs to be tested at run-time by versioning for alias.
3359    Merge several alias checks into one if possible.
3360    Return FALSE if resulting list of ddrs is longer then allowed by
3361    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3362
3363 bool
3364 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3365 {
3366   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3367   hash_set <tree_pair_hash> compared_objects;
3368
3369   vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3370   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3371     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3372   vec<vec_object_pair> &check_unequal_addrs
3373     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3374   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3375   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3376
3377   ddr_p ddr;
3378   unsigned int i;
3379   tree length_factor;
3380
3381   if (dump_enabled_p ())
3382     dump_printf_loc (MSG_NOTE, vect_location,
3383                      "=== vect_prune_runtime_alias_test_list ===\n");
3384
3385   /* Step values are irrelevant for aliasing if the number of vector
3386      iterations is equal to the number of scalar iterations (which can
3387      happen for fully-SLP loops).  */
3388   bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3389
3390   if (!ignore_step_p)
3391     {
3392       /* Convert the checks for nonzero steps into bound tests.  */
3393       tree value;
3394       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3395         vect_check_lower_bound (loop_vinfo, value, true, 1);
3396     }
3397
3398   if (may_alias_ddrs.is_empty ())
3399     return true;
3400
3401   comp_alias_ddrs.create (may_alias_ddrs.length ());
3402
3403   unsigned int loop_depth
3404     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3405                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
3406
3407   /* First, we collect all data ref pairs for aliasing checks.  */
3408   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3409     {
3410       int comp_res;
3411       poly_uint64 lower_bound;
3412       struct data_reference *dr_a, *dr_b;
3413       gimple *dr_group_first_a, *dr_group_first_b;
3414       tree segment_length_a, segment_length_b;
3415       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3416       unsigned int align_a, align_b;
3417       gimple *stmt_a, *stmt_b;
3418
3419       /* Ignore the alias if the VF we chose ended up being no greater
3420          than the dependence distance.  */
3421       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3422         continue;
3423
3424       if (DDR_OBJECT_A (ddr))
3425         {
3426           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3427           if (!compared_objects.add (new_pair))
3428             {
3429               if (dump_enabled_p ())
3430                 {
3431                   dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3432                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3433                   dump_printf (MSG_NOTE, " and ");
3434                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3435                   dump_printf (MSG_NOTE, " have different addresses\n");
3436                 }
3437               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3438             }
3439           continue;
3440         }
3441
3442       dr_a = DDR_A (ddr);
3443       stmt_a = DR_STMT (DDR_A (ddr));
3444
3445       dr_b = DDR_B (ddr);
3446       stmt_b = DR_STMT (DDR_B (ddr));
3447
3448       /* Skip the pair if inter-iteration dependencies are irrelevant
3449          and intra-iteration dependencies are guaranteed to be honored.  */
3450       if (ignore_step_p
3451           && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3452               || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3453         {
3454           if (dump_enabled_p ())
3455             {
3456               dump_printf_loc (MSG_NOTE, vect_location,
3457                                "no need for alias check between ");
3458               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3459               dump_printf (MSG_NOTE, " and ");
3460               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3461               dump_printf (MSG_NOTE, " when VF is 1\n");
3462             }
3463           continue;
3464         }
3465
3466       /* See whether we can handle the alias using a bounds check on
3467          the step, and whether that's likely to be the best approach.
3468          (It might not be, for example, if the minimum step is much larger
3469          than the number of bytes handled by one vector iteration.)  */
3470       if (!ignore_step_p
3471           && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3472           && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3473           && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3474               || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3475         {
3476           bool unsigned_p = dr_known_forward_stride_p (dr_a);
3477           if (dump_enabled_p ())
3478             {
3479               dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3480               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3481               dump_printf (MSG_NOTE, " and ");
3482               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3483               dump_printf (MSG_NOTE, " when the step ");
3484               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3485               dump_printf (MSG_NOTE, " is outside ");
3486               if (unsigned_p)
3487                 dump_printf (MSG_NOTE, "[0");
3488               else
3489                 {
3490                   dump_printf (MSG_NOTE, "(");
3491                   dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3492                 }
3493               dump_printf (MSG_NOTE, ", ");
3494               dump_dec (MSG_NOTE, lower_bound);
3495               dump_printf (MSG_NOTE, ")\n");
3496             }
3497           vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3498                                   lower_bound);
3499           continue;
3500         }
3501
3502       dr_group_first_a = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3503       if (dr_group_first_a)
3504         {
3505           stmt_a = dr_group_first_a;
3506           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3507         }
3508
3509       dr_group_first_b = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3510       if (dr_group_first_b)
3511         {
3512           stmt_b = dr_group_first_b;
3513           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3514         }
3515
3516       if (ignore_step_p)
3517         {
3518           segment_length_a = size_zero_node;
3519           segment_length_b = size_zero_node;
3520         }
3521       else
3522         {
3523           if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3524             length_factor = scalar_loop_iters;
3525           else
3526             length_factor = size_int (vect_factor);
3527           segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3528           segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3529         }
3530       access_size_a = vect_vfa_access_size (dr_a);
3531       access_size_b = vect_vfa_access_size (dr_b);
3532       align_a = vect_vfa_align (dr_a);
3533       align_b = vect_vfa_align (dr_b);
3534
3535       comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3536                                         DR_BASE_ADDRESS (dr_b));
3537       if (comp_res == 0)
3538         comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3539                                           DR_OFFSET (dr_b));
3540
3541       /* See whether the alias is known at compilation time.  */
3542       if (comp_res == 0
3543           && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3544           && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3545           && poly_int_tree_p (segment_length_a)
3546           && poly_int_tree_p (segment_length_b))
3547         {
3548           int res = vect_compile_time_alias (dr_a, dr_b,
3549                                              segment_length_a,
3550                                              segment_length_b,
3551                                              access_size_a,
3552                                              access_size_b);
3553           if (res >= 0 && dump_enabled_p ())
3554             {
3555               dump_printf_loc (MSG_NOTE, vect_location,
3556                                "can tell at compile time that ");
3557               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3558               dump_printf (MSG_NOTE, " and ");
3559               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3560               if (res == 0)
3561                 dump_printf (MSG_NOTE, " do not alias\n");
3562               else
3563                 dump_printf (MSG_NOTE, " alias\n");
3564             }
3565
3566           if (res == 0)
3567             continue;
3568
3569           if (res == 1)
3570             {
3571               if (dump_enabled_p ())
3572                 dump_printf_loc (MSG_NOTE, vect_location,
3573                                  "not vectorized: compilation time alias.\n");
3574               return false;
3575             }
3576         }
3577
3578       dr_with_seg_len_pair_t dr_with_seg_len_pair
3579         (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3580          dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3581
3582       /* Canonicalize pairs by sorting the two DR members.  */
3583       if (comp_res > 0)
3584         std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3585
3586       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3587     }
3588
3589   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3590
3591   unsigned int count = (comp_alias_ddrs.length ()
3592                         + check_unequal_addrs.length ());
3593
3594   dump_printf_loc (MSG_NOTE, vect_location,
3595                    "improved number of alias checks from %d to %d\n",
3596                    may_alias_ddrs.length (), count);
3597   if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3598     {
3599       if (dump_enabled_p ())
3600         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3601                          "number of versioning for alias "
3602                          "run-time tests exceeds %d "
3603                          "(--param vect-max-version-for-alias-checks)\n",
3604                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3605       return false;
3606     }
3607
3608   return true;
3609 }
3610
3611 /* Check whether we can use an internal function for a gather load
3612    or scatter store.  READ_P is true for loads and false for stores.
3613    MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
3614    the type of the memory elements being loaded or stored.  OFFSET_BITS
3615    is the number of bits in each scalar offset and OFFSET_SIGN is the
3616    sign of the offset.  SCALE is the amount by which the offset should
3617    be multiplied *after* it has been converted to address width.
3618
3619    Return true if the function is supported, storing the function
3620    id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT.  */
3621
3622 bool
3623 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3624                           tree memory_type, unsigned int offset_bits,
3625                           signop offset_sign, int scale,
3626                           internal_fn *ifn_out, tree *element_type_out)
3627 {
3628   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3629   unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3630   if (offset_bits > element_bits)
3631     /* Internal functions require the offset to be the same width as
3632        the vector elements.  We can extend narrower offsets, but it isn't
3633        safe to truncate wider offsets.  */
3634     return false;
3635
3636   if (element_bits != memory_bits)
3637     /* For now the vector elements must be the same width as the
3638        memory elements.  */
3639     return false;
3640
3641   /* Work out which function we need.  */
3642   internal_fn ifn;
3643   if (read_p)
3644     ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3645   else
3646     ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3647
3648   /* Test whether the target supports this combination.  */
3649   if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3650                                                offset_sign, scale))
3651     return false;
3652
3653   *ifn_out = ifn;
3654   *element_type_out = TREE_TYPE (vectype);
3655   return true;
3656 }
3657
3658 /* CALL is a call to an internal gather load or scatter store function.
3659    Describe the operation in INFO.  */
3660
3661 static void
3662 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3663 {
3664   stmt_vec_info stmt_info = vinfo_for_stmt (call);
3665   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3666   data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3667
3668   info->ifn = gimple_call_internal_fn (call);
3669   info->decl = NULL_TREE;
3670   info->base = gimple_call_arg (call, 0);
3671   info->offset = gimple_call_arg (call, 1);
3672   info->offset_dt = vect_unknown_def_type;
3673   info->offset_vectype = NULL_TREE;
3674   info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3675   info->element_type = TREE_TYPE (vectype);
3676   info->memory_type = TREE_TYPE (DR_REF (dr));
3677 }
3678
3679 /* Return true if a non-affine read or write in STMT is suitable for a
3680    gather load or scatter store.  Describe the operation in *INFO if so.  */
3681
3682 bool
3683 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3684                            gather_scatter_info *info)
3685 {
3686   HOST_WIDE_INT scale = 1;
3687   poly_int64 pbitpos, pbitsize;
3688   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3689   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3690   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3691   tree offtype = NULL_TREE;
3692   tree decl = NULL_TREE, base, off;
3693   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3694   tree memory_type = TREE_TYPE (DR_REF (dr));
3695   machine_mode pmode;
3696   int punsignedp, reversep, pvolatilep = 0;
3697   internal_fn ifn;
3698   tree element_type;
3699   bool masked_p = false;
3700
3701   /* See whether this is already a call to a gather/scatter internal function.
3702      If not, see whether it's a masked load or store.  */
3703   gcall *call = dyn_cast <gcall *> (stmt);
3704   if (call && gimple_call_internal_p (call))
3705     {
3706       ifn = gimple_call_internal_fn (stmt);
3707       if (internal_gather_scatter_fn_p (ifn))
3708         {
3709           vect_describe_gather_scatter_call (call, info);
3710           return true;
3711         }
3712       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3713     }
3714
3715   /* True if we should aim to use internal functions rather than
3716      built-in functions.  */
3717   bool use_ifn_p = (DR_IS_READ (dr)
3718                     ? supports_vec_gather_load_p ()
3719                     : supports_vec_scatter_store_p ());
3720
3721   base = DR_REF (dr);
3722   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3723      see if we can use the def stmt of the address.  */
3724   if (masked_p
3725       && TREE_CODE (base) == MEM_REF
3726       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3727       && integer_zerop (TREE_OPERAND (base, 1))
3728       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3729     {
3730       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3731       if (is_gimple_assign (def_stmt)
3732           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3733         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3734     }
3735
3736   /* The gather and scatter builtins need address of the form
3737      loop_invariant + vector * {1, 2, 4, 8}
3738      or
3739      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3740      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3741      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3742      multiplications and additions in it.  To get a vector, we need
3743      a single SSA_NAME that will be defined in the loop and will
3744      contain everything that is not loop invariant and that can be
3745      vectorized.  The following code attempts to find such a preexistng
3746      SSA_NAME OFF and put the loop invariants into a tree BASE
3747      that can be gimplified before the loop.  */
3748   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3749                               &punsignedp, &reversep, &pvolatilep);
3750   gcc_assert (base && !reversep);
3751   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3752
3753   if (TREE_CODE (base) == MEM_REF)
3754     {
3755       if (!integer_zerop (TREE_OPERAND (base, 1)))
3756         {
3757           if (off == NULL_TREE)
3758             off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3759           else
3760             off = size_binop (PLUS_EXPR, off,
3761                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3762         }
3763       base = TREE_OPERAND (base, 0);
3764     }
3765   else
3766     base = build_fold_addr_expr (base);
3767
3768   if (off == NULL_TREE)
3769     off = size_zero_node;
3770
3771   /* If base is not loop invariant, either off is 0, then we start with just
3772      the constant offset in the loop invariant BASE and continue with base
3773      as OFF, otherwise give up.
3774      We could handle that case by gimplifying the addition of base + off
3775      into some SSA_NAME and use that as off, but for now punt.  */
3776   if (!expr_invariant_in_loop_p (loop, base))
3777     {
3778       if (!integer_zerop (off))
3779         return false;
3780       off = base;
3781       base = size_int (pbytepos);
3782     }
3783   /* Otherwise put base + constant offset into the loop invariant BASE
3784      and continue with OFF.  */
3785   else
3786     {
3787       base = fold_convert (sizetype, base);
3788       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3789     }
3790
3791   /* OFF at this point may be either a SSA_NAME or some tree expression
3792      from get_inner_reference.  Try to peel off loop invariants from it
3793      into BASE as long as possible.  */
3794   STRIP_NOPS (off);
3795   while (offtype == NULL_TREE)
3796     {
3797       enum tree_code code;
3798       tree op0, op1, add = NULL_TREE;
3799
3800       if (TREE_CODE (off) == SSA_NAME)
3801         {
3802           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3803
3804           if (expr_invariant_in_loop_p (loop, off))
3805             return false;
3806
3807           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3808             break;
3809
3810           op0 = gimple_assign_rhs1 (def_stmt);
3811           code = gimple_assign_rhs_code (def_stmt);
3812           op1 = gimple_assign_rhs2 (def_stmt);
3813         }
3814       else
3815         {
3816           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3817             return false;
3818           code = TREE_CODE (off);
3819           extract_ops_from_tree (off, &code, &op0, &op1);
3820         }
3821       switch (code)
3822         {
3823         case POINTER_PLUS_EXPR:
3824         case PLUS_EXPR:
3825           if (expr_invariant_in_loop_p (loop, op0))
3826             {
3827               add = op0;
3828               off = op1;
3829             do_add:
3830               add = fold_convert (sizetype, add);
3831               if (scale != 1)
3832                 add = size_binop (MULT_EXPR, add, size_int (scale));
3833               base = size_binop (PLUS_EXPR, base, add);
3834               continue;
3835             }
3836           if (expr_invariant_in_loop_p (loop, op1))
3837             {
3838               add = op1;
3839               off = op0;
3840               goto do_add;
3841             }
3842           break;
3843         case MINUS_EXPR:
3844           if (expr_invariant_in_loop_p (loop, op1))
3845             {
3846               add = fold_convert (sizetype, op1);
3847               add = size_binop (MINUS_EXPR, size_zero_node, add);
3848               off = op0;
3849               goto do_add;
3850             }
3851           break;
3852         case MULT_EXPR:
3853           if (scale == 1 && tree_fits_shwi_p (op1))
3854             {
3855               int new_scale = tree_to_shwi (op1);
3856               /* Only treat this as a scaling operation if the target
3857                  supports it.  */
3858               if (use_ifn_p
3859                   && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3860                                                 vectype, memory_type, 1,
3861                                                 TYPE_SIGN (TREE_TYPE (op0)),
3862                                                 new_scale, &ifn,
3863                                                 &element_type))
3864                 break;
3865               scale = new_scale;
3866               off = op0;
3867               continue;
3868             }
3869           break;
3870         case SSA_NAME:
3871           off = op0;
3872           continue;
3873         CASE_CONVERT:
3874           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3875               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3876             break;
3877           if (TYPE_PRECISION (TREE_TYPE (op0))
3878               == TYPE_PRECISION (TREE_TYPE (off)))
3879             {
3880               off = op0;
3881               continue;
3882             }
3883
3884           /* The internal functions need the offset to be the same width
3885              as the elements of VECTYPE.  Don't include operations that
3886              cast the offset from that width to a different width.  */
3887           if (use_ifn_p
3888               && (int_size_in_bytes (TREE_TYPE (vectype))
3889                   == int_size_in_bytes (TREE_TYPE (off))))
3890             break;
3891
3892           if (TYPE_PRECISION (TREE_TYPE (op0))
3893               < TYPE_PRECISION (TREE_TYPE (off)))
3894             {
3895               off = op0;
3896               offtype = TREE_TYPE (off);
3897               STRIP_NOPS (off);
3898               continue;
3899             }
3900           break;
3901         default:
3902           break;
3903         }
3904       break;
3905     }
3906
3907   /* If at the end OFF still isn't a SSA_NAME or isn't
3908      defined in the loop, punt.  */
3909   if (TREE_CODE (off) != SSA_NAME
3910       || expr_invariant_in_loop_p (loop, off))
3911     return false;
3912
3913   if (offtype == NULL_TREE)
3914     offtype = TREE_TYPE (off);
3915
3916   if (use_ifn_p)
3917     {
3918       if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3919                                      memory_type, TYPE_PRECISION (offtype),
3920                                      TYPE_SIGN (offtype), scale, &ifn,
3921                                      &element_type))
3922         return false;
3923     }
3924   else
3925     {
3926       if (DR_IS_READ (dr))
3927         {
3928           if (targetm.vectorize.builtin_gather)
3929             decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3930         }
3931       else
3932         {
3933           if (targetm.vectorize.builtin_scatter)
3934             decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3935         }
3936
3937       if (!decl)
3938         return false;
3939
3940       ifn = IFN_LAST;
3941       element_type = TREE_TYPE (vectype);
3942     }
3943
3944   info->ifn = ifn;
3945   info->decl = decl;
3946   info->base = base;
3947   info->offset = off;
3948   info->offset_dt = vect_unknown_def_type;
3949   info->offset_vectype = NULL_TREE;
3950   info->scale = scale;
3951   info->element_type = element_type;
3952   info->memory_type = memory_type;
3953   return true;
3954 }
3955
3956 /* Find the data references in STMT, analyze them with respect to LOOP and
3957    append them to DATAREFS.  Return false if datarefs in this stmt cannot
3958    be handled.  */
3959
3960 bool
3961 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3962                                vec<data_reference_p> *datarefs)
3963 {
3964   /* We can ignore clobbers for dataref analysis - they are removed during
3965      loop vectorization and BB vectorization checks dependences with a
3966      stmt walk.  */
3967   if (gimple_clobber_p (stmt))
3968     return true;
3969
3970   if (gimple_has_volatile_ops (stmt))
3971     {
3972       if (dump_enabled_p ())
3973         {
3974           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3975                            "not vectorized: volatile type ");
3976           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3977         }
3978       return false;
3979     }
3980
3981   if (stmt_can_throw_internal (stmt))
3982     {
3983       if (dump_enabled_p ())
3984         {
3985           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3986                            "not vectorized: statement can throw an "
3987                            "exception ");
3988           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3989         }
3990       return false;
3991     }
3992
3993   auto_vec<data_reference_p, 2> refs;
3994   if (!find_data_references_in_stmt (loop, stmt, &refs))
3995     return false;
3996
3997   if (refs.is_empty ())
3998     return true;
3999
4000   if (refs.length () > 1)
4001     {
4002       if (dump_enabled_p ())
4003         {
4004           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4005                            "not vectorized: more than one data ref "
4006                            "in stmt: ");
4007           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4008         }
4009       return false;
4010     }
4011
4012   if (gcall *call = dyn_cast <gcall *> (stmt))
4013     if (!gimple_call_internal_p (call)
4014         || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4015             && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4016       {
4017         if (dump_enabled_p ())
4018           {
4019             dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
4020                              "not vectorized: dr in a call ");
4021             dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4022           }
4023         return false;
4024       }
4025
4026   data_reference_p dr = refs.pop ();
4027   if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4028       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4029     {
4030       if (dump_enabled_p ())
4031         {
4032           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4033                            "not vectorized: statement is bitfield "
4034                            "access ");
4035           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4036         }
4037       return false;
4038     }
4039
4040   if (DR_BASE_ADDRESS (dr)
4041       && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4042     {
4043       if (dump_enabled_p ())
4044         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4045                          "not vectorized: base addr of dr is a "
4046                          "constant\n");
4047       return false;
4048     }
4049
4050   datarefs->safe_push (dr);
4051   return true;
4052 }
4053
4054 /* Function vect_analyze_data_refs.
4055
4056   Find all the data references in the loop or basic block.
4057
4058    The general structure of the analysis of data refs in the vectorizer is as
4059    follows:
4060    1- vect_analyze_data_refs(loop/bb): call
4061       compute_data_dependences_for_loop/bb to find and analyze all data-refs
4062       in the loop/bb and their dependences.
4063    2- vect_analyze_dependences(): apply dependence testing using ddrs.
4064    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4065    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4066
4067 */
4068
4069 bool
4070 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4071 {
4072   struct loop *loop = NULL;
4073   unsigned int i;
4074   struct data_reference *dr;
4075   tree scalar_type;
4076
4077   if (dump_enabled_p ())
4078     dump_printf_loc (MSG_NOTE, vect_location,
4079                      "=== vect_analyze_data_refs ===\n");
4080
4081   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4082     loop = LOOP_VINFO_LOOP (loop_vinfo);
4083
4084   /* Go through the data-refs, check that the analysis succeeded.  Update
4085      pointer from stmt_vec_info struct to DR and vectype.  */
4086
4087   vec<data_reference_p> datarefs = vinfo->datarefs;
4088   FOR_EACH_VEC_ELT (datarefs, i, dr)
4089     {
4090       gimple *stmt;
4091       stmt_vec_info stmt_info;
4092       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4093       bool simd_lane_access = false;
4094       poly_uint64 vf;
4095
4096       gcc_assert (DR_REF (dr));
4097       stmt = DR_STMT (dr);
4098       stmt_info = vinfo_for_stmt (stmt);
4099
4100       /* Check that analysis of the data-ref succeeded.  */
4101       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4102           || !DR_STEP (dr))
4103         {
4104           bool maybe_gather
4105             = DR_IS_READ (dr)
4106               && !TREE_THIS_VOLATILE (DR_REF (dr))
4107               && (targetm.vectorize.builtin_gather != NULL
4108                   || supports_vec_gather_load_p ());
4109           bool maybe_scatter
4110             = DR_IS_WRITE (dr)
4111               && !TREE_THIS_VOLATILE (DR_REF (dr))
4112               && (targetm.vectorize.builtin_scatter != NULL
4113                   || supports_vec_scatter_store_p ());
4114           bool maybe_simd_lane_access
4115             = is_a <loop_vec_info> (vinfo) && loop->simduid;
4116
4117           /* If target supports vector gather loads or scatter stores, or if
4118              this might be a SIMD lane access, see if they can't be used.  */
4119           if (is_a <loop_vec_info> (vinfo)
4120               && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
4121               && !nested_in_vect_loop_p (loop, stmt))
4122             {
4123               struct data_reference *newdr
4124                 = create_data_ref (NULL, loop_containing_stmt (stmt),
4125                                    DR_REF (dr), stmt, !maybe_scatter,
4126                                    DR_IS_CONDITIONAL_IN_STMT (dr));
4127               gcc_assert (newdr != NULL && DR_REF (newdr));
4128               if (DR_BASE_ADDRESS (newdr)
4129                   && DR_OFFSET (newdr)
4130                   && DR_INIT (newdr)
4131                   && DR_STEP (newdr)
4132                   && integer_zerop (DR_STEP (newdr)))
4133                 {
4134                   if (maybe_simd_lane_access)
4135                     {
4136                       tree off = DR_OFFSET (newdr);
4137                       STRIP_NOPS (off);
4138                       if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4139                           && TREE_CODE (off) == MULT_EXPR
4140                           && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4141                         {
4142                           tree step = TREE_OPERAND (off, 1);
4143                           off = TREE_OPERAND (off, 0);
4144                           STRIP_NOPS (off);
4145                           if (CONVERT_EXPR_P (off)
4146                               && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4147                                                                           0)))
4148                                  < TYPE_PRECISION (TREE_TYPE (off)))
4149                             off = TREE_OPERAND (off, 0);
4150                           if (TREE_CODE (off) == SSA_NAME)
4151                             {
4152                               gimple *def = SSA_NAME_DEF_STMT (off);
4153                               tree reft = TREE_TYPE (DR_REF (newdr));
4154                               if (is_gimple_call (def)
4155                                   && gimple_call_internal_p (def)
4156                                   && (gimple_call_internal_fn (def)
4157                                       == IFN_GOMP_SIMD_LANE))
4158                                 {
4159                                   tree arg = gimple_call_arg (def, 0);
4160                                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
4161                                   arg = SSA_NAME_VAR (arg);
4162                                   if (arg == loop->simduid
4163                                       /* For now.  */
4164                                       && tree_int_cst_equal
4165                                            (TYPE_SIZE_UNIT (reft),
4166                                             step))
4167                                     {
4168                                       DR_OFFSET (newdr) = ssize_int (0);
4169                                       DR_STEP (newdr) = step;
4170                                       DR_OFFSET_ALIGNMENT (newdr)
4171                                         = BIGGEST_ALIGNMENT;
4172                                       DR_STEP_ALIGNMENT (newdr)
4173                                         = highest_pow2_factor (step);
4174                                       dr = newdr;
4175                                       simd_lane_access = true;
4176                                     }
4177                                 }
4178                             }
4179                         }
4180                     }
4181                   if (!simd_lane_access && (maybe_gather || maybe_scatter))
4182                     {
4183                       dr = newdr;
4184                       if (maybe_gather)
4185                         gatherscatter = GATHER;
4186                       else
4187                         gatherscatter = SCATTER;
4188                     }
4189                 }
4190               if (gatherscatter == SG_NONE && !simd_lane_access)
4191                 free_data_ref (newdr);
4192             }
4193
4194           if (gatherscatter == SG_NONE && !simd_lane_access)
4195             {
4196               if (dump_enabled_p ())
4197                 {
4198                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4199                                    "not vectorized: data ref analysis "
4200                                    "failed ");
4201                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4202                 }
4203               if (is_a <bb_vec_info> (vinfo))
4204                 {
4205                   /* In BB vectorization the ref can still participate
4206                      in dependence analysis, we just can't vectorize it.  */
4207                   STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4208                   continue;
4209                 }
4210               return false;
4211             }
4212         }
4213
4214       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4215           && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4216           && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4217         {
4218           if (dump_enabled_p ())
4219             {
4220               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4221                                "not vectorized: base object not addressable "
4222                                "for stmt: ");
4223               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4224             }
4225           if (is_a <bb_vec_info> (vinfo))
4226             {
4227               /* In BB vectorization the ref can still participate
4228                  in dependence analysis, we just can't vectorize it.  */
4229               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4230               continue;
4231             }
4232           return false;
4233         }
4234
4235       if (is_a <loop_vec_info> (vinfo)
4236           && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4237         {
4238           if (nested_in_vect_loop_p (loop, stmt))
4239             {
4240               if (dump_enabled_p ())
4241                 {
4242                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4243                                    "not vectorized: not suitable for strided "
4244                                    "load ");
4245                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4246                 }
4247               return false;
4248             }
4249           STMT_VINFO_STRIDED_P (stmt_info) = true;
4250         }
4251
4252       /* Update DR field in stmt_vec_info struct.  */
4253
4254       /* If the dataref is in an inner-loop of the loop that is considered for
4255          for vectorization, we also want to analyze the access relative to
4256          the outer-loop (DR contains information only relative to the
4257          inner-most enclosing loop).  We do that by building a reference to the
4258          first location accessed by the inner-loop, and analyze it relative to
4259          the outer-loop.  */
4260       if (loop && nested_in_vect_loop_p (loop, stmt))
4261         {
4262           /* Build a reference to the first location accessed by the
4263              inner loop: *(BASE + INIT + OFFSET).  By construction,
4264              this address must be invariant in the inner loop, so we
4265              can consider it as being used in the outer loop.  */
4266           tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4267           tree offset = unshare_expr (DR_OFFSET (dr));
4268           tree init = unshare_expr (DR_INIT (dr));
4269           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4270                                           init, offset);
4271           tree init_addr = fold_build_pointer_plus (base, init_offset);
4272           tree init_ref = build_fold_indirect_ref (init_addr);
4273
4274           if (dump_enabled_p ())
4275             {
4276               dump_printf_loc (MSG_NOTE, vect_location,
4277                                "analyze in outer loop: ");
4278               dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4279               dump_printf (MSG_NOTE, "\n");
4280             }
4281
4282           if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4283                                      init_ref, loop))
4284             /* dr_analyze_innermost already explained the failure.  */
4285             return false;
4286
4287           if (dump_enabled_p ())
4288             {
4289               dump_printf_loc (MSG_NOTE, vect_location,
4290                                "\touter base_address: ");
4291               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4292                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4293               dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4294               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4295                                  STMT_VINFO_DR_OFFSET (stmt_info));
4296               dump_printf (MSG_NOTE,
4297                            "\n\touter constant offset from base address: ");
4298               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4299                                  STMT_VINFO_DR_INIT (stmt_info));
4300               dump_printf (MSG_NOTE, "\n\touter step: ");
4301               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4302                                  STMT_VINFO_DR_STEP (stmt_info));
4303               dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4304                            STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4305               dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4306                            STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4307               dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4308                            STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4309               dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4310                            STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4311             }
4312         }
4313
4314       gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
4315       STMT_VINFO_DATA_REF (stmt_info) = dr;
4316       if (simd_lane_access)
4317         {
4318           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4319           free_data_ref (datarefs[i]);
4320           datarefs[i] = dr;
4321         }
4322
4323       /* Set vectype for STMT.  */
4324       scalar_type = TREE_TYPE (DR_REF (dr));
4325       STMT_VINFO_VECTYPE (stmt_info)
4326         = get_vectype_for_scalar_type (scalar_type);
4327       if (!STMT_VINFO_VECTYPE (stmt_info))
4328         {
4329           if (dump_enabled_p ())
4330             {
4331               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4332                                "not vectorized: no vectype for stmt: ");
4333               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4334               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4335               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4336                                  scalar_type);
4337               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4338             }
4339
4340           if (is_a <bb_vec_info> (vinfo))
4341             {
4342               /* No vector type is fine, the ref can still participate
4343                  in dependence analysis, we just can't vectorize it.  */
4344               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4345               continue;
4346             }
4347
4348           if (gatherscatter != SG_NONE || simd_lane_access)
4349             {
4350               STMT_VINFO_DATA_REF (stmt_info) = NULL;
4351               if (gatherscatter != SG_NONE)
4352                 free_data_ref (dr);
4353             }
4354           return false;
4355         }
4356       else
4357         {
4358           if (dump_enabled_p ())
4359             {
4360               dump_printf_loc (MSG_NOTE, vect_location,
4361                                "got vectype for stmt: ");
4362               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4363               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4364                                  STMT_VINFO_VECTYPE (stmt_info));
4365               dump_printf (MSG_NOTE, "\n");
4366             }
4367         }
4368
4369       /* Adjust the minimal vectorization factor according to the
4370          vector type.  */
4371       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4372       *min_vf = upper_bound (*min_vf, vf);
4373
4374       if (gatherscatter != SG_NONE)
4375         {
4376           gather_scatter_info gs_info;
4377           if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4378                                           &gs_info)
4379               || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4380             {
4381               STMT_VINFO_DATA_REF (stmt_info) = NULL;
4382               free_data_ref (dr);
4383               if (dump_enabled_p ())
4384                 {
4385                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4386                                    (gatherscatter == GATHER) ?
4387                                    "not vectorized: not suitable for gather "
4388                                    "load " :
4389                                    "not vectorized: not suitable for scatter "
4390                                    "store ");
4391                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4392                 }
4393               return false;
4394             }
4395
4396           free_data_ref (datarefs[i]);
4397           datarefs[i] = dr;
4398           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4399         }
4400     }
4401
4402   /* We used to stop processing and prune the list here.  Verify we no
4403      longer need to.  */
4404   gcc_assert (i == datarefs.length ());
4405
4406   return true;
4407 }
4408
4409
4410 /* Function vect_get_new_vect_var.
4411
4412    Returns a name for a new variable.  The current naming scheme appends the
4413    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4414    the name of vectorizer generated variables, and appends that to NAME if
4415    provided.  */
4416
4417 tree
4418 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4419 {
4420   const char *prefix;
4421   tree new_vect_var;
4422
4423   switch (var_kind)
4424   {
4425   case vect_simple_var:
4426     prefix = "vect";
4427     break;
4428   case vect_scalar_var:
4429     prefix = "stmp";
4430     break;
4431   case vect_mask_var:
4432     prefix = "mask";
4433     break;
4434   case vect_pointer_var:
4435     prefix = "vectp";
4436     break;
4437   default:
4438     gcc_unreachable ();
4439   }
4440
4441   if (name)
4442     {
4443       char* tmp = concat (prefix, "_", name, NULL);
4444       new_vect_var = create_tmp_reg (type, tmp);
4445       free (tmp);
4446     }
4447   else
4448     new_vect_var = create_tmp_reg (type, prefix);
4449
4450   return new_vect_var;
4451 }
4452
4453 /* Like vect_get_new_vect_var but return an SSA name.  */
4454
4455 tree
4456 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4457 {
4458   const char *prefix;
4459   tree new_vect_var;
4460
4461   switch (var_kind)
4462   {
4463   case vect_simple_var:
4464     prefix = "vect";
4465     break;
4466   case vect_scalar_var:
4467     prefix = "stmp";
4468     break;
4469   case vect_pointer_var:
4470     prefix = "vectp";
4471     break;
4472   default:
4473     gcc_unreachable ();
4474   }
4475
4476   if (name)
4477     {
4478       char* tmp = concat (prefix, "_", name, NULL);
4479       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4480       free (tmp);
4481     }
4482   else
4483     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4484
4485   return new_vect_var;
4486 }
4487
4488 /* Duplicate ptr info and set alignment/misaligment on NAME from DR.  */
4489
4490 static void
4491 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4492 {
4493   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4494   int misalign = DR_MISALIGNMENT (dr);
4495   if (misalign == DR_MISALIGNMENT_UNKNOWN)
4496     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4497   else
4498     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4499                             DR_TARGET_ALIGNMENT (dr), misalign);
4500 }
4501
4502 /* Function vect_create_addr_base_for_vector_ref.
4503
4504    Create an expression that computes the address of the first memory location
4505    that will be accessed for a data reference.
4506
4507    Input:
4508    STMT: The statement containing the data reference.
4509    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4510    OFFSET: Optional. If supplied, it is be added to the initial address.
4511    LOOP:    Specify relative to which loop-nest should the address be computed.
4512             For example, when the dataref is in an inner-loop nested in an
4513             outer-loop that is now being vectorized, LOOP can be either the
4514             outer-loop, or the inner-loop.  The first memory location accessed
4515             by the following dataref ('in' points to short):
4516
4517                 for (i=0; i<N; i++)
4518                    for (j=0; j<M; j++)
4519                      s += in[i+j]
4520
4521             is as follows:
4522             if LOOP=i_loop:     &in             (relative to i_loop)
4523             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
4524    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4525             initial address.  Unlike OFFSET, which is number of elements to
4526             be added, BYTE_OFFSET is measured in bytes.
4527
4528    Output:
4529    1. Return an SSA_NAME whose value is the address of the memory location of
4530       the first vector of the data reference.
4531    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4532       these statement(s) which define the returned SSA_NAME.
4533
4534    FORNOW: We are only handling array accesses with step 1.  */
4535
4536 tree
4537 vect_create_addr_base_for_vector_ref (gimple *stmt,
4538                                       gimple_seq *new_stmt_list,
4539                                       tree offset,
4540                                       tree byte_offset)
4541 {
4542   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4543   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4544   const char *base_name;
4545   tree addr_base;
4546   tree dest;
4547   gimple_seq seq = NULL;
4548   tree vect_ptr_type;
4549   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4550   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4551   innermost_loop_behavior *drb = vect_dr_behavior (dr);
4552
4553   tree data_ref_base = unshare_expr (drb->base_address);
4554   tree base_offset = unshare_expr (drb->offset);
4555   tree init = unshare_expr (drb->init);
4556
4557   if (loop_vinfo)
4558     base_name = get_name (data_ref_base);
4559   else
4560     {
4561       base_offset = ssize_int (0);
4562       init = ssize_int (0);
4563       base_name = get_name (DR_REF (dr));
4564     }
4565
4566   /* Create base_offset */
4567   base_offset = size_binop (PLUS_EXPR,
4568                             fold_convert (sizetype, base_offset),
4569                             fold_convert (sizetype, init));
4570
4571   if (offset)
4572     {
4573       offset = fold_build2 (MULT_EXPR, sizetype,
4574                             fold_convert (sizetype, offset), step);
4575       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4576                                  base_offset, offset);
4577     }
4578   if (byte_offset)
4579     {
4580       byte_offset = fold_convert (sizetype, byte_offset);
4581       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4582                                  base_offset, byte_offset);
4583     }
4584
4585   /* base + base_offset */
4586   if (loop_vinfo)
4587     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4588   else
4589     {
4590       addr_base = build1 (ADDR_EXPR,
4591                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
4592                           unshare_expr (DR_REF (dr)));
4593     }
4594
4595   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4596   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4597   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4598   gimple_seq_add_seq (new_stmt_list, seq);
4599
4600   if (DR_PTR_INFO (dr)
4601       && TREE_CODE (addr_base) == SSA_NAME
4602       && !SSA_NAME_PTR_INFO (addr_base))
4603     {
4604       vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4605       if (offset || byte_offset)
4606         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4607     }
4608
4609   if (dump_enabled_p ())
4610     {
4611       dump_printf_loc (MSG_NOTE, vect_location, "created ");
4612       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4613       dump_printf (MSG_NOTE, "\n");
4614     }
4615
4616   return addr_base;
4617 }
4618
4619
4620 /* Function vect_create_data_ref_ptr.
4621
4622    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4623    location accessed in the loop by STMT, along with the def-use update
4624    chain to appropriately advance the pointer through the loop iterations.
4625    Also set aliasing information for the pointer.  This pointer is used by
4626    the callers to this function to create a memory reference expression for
4627    vector load/store access.
4628
4629    Input:
4630    1. STMT: a stmt that references memory. Expected to be of the form
4631          GIMPLE_ASSIGN <name, data-ref> or
4632          GIMPLE_ASSIGN <data-ref, name>.
4633    2. AGGR_TYPE: the type of the reference, which should be either a vector
4634         or an array.
4635    3. AT_LOOP: the loop where the vector memref is to be created.
4636    4. OFFSET (optional): an offset to be added to the initial address accessed
4637         by the data-ref in STMT.
4638    5. BSI: location where the new stmts are to be placed if there is no loop
4639    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4640         pointing to the initial address.
4641    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4642         to the initial address accessed by the data-ref in STMT.  This is
4643         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4644         in bytes.
4645    8. IV_STEP (optional, defaults to NULL): the amount that should be added
4646         to the IV during each iteration of the loop.  NULL says to move
4647         by one copy of AGGR_TYPE up or down, depending on the step of the
4648         data reference.
4649
4650    Output:
4651    1. Declare a new ptr to vector_type, and have it point to the base of the
4652       data reference (initial addressed accessed by the data reference).
4653       For example, for vector of type V8HI, the following code is generated:
4654
4655       v8hi *ap;
4656       ap = (v8hi *)initial_address;
4657
4658       if OFFSET is not supplied:
4659          initial_address = &a[init];
4660       if OFFSET is supplied:
4661          initial_address = &a[init + OFFSET];
4662       if BYTE_OFFSET is supplied:
4663          initial_address = &a[init] + BYTE_OFFSET;
4664
4665       Return the initial_address in INITIAL_ADDRESS.
4666
4667    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4668       update the pointer in each iteration of the loop.
4669
4670       Return the increment stmt that updates the pointer in PTR_INCR.
4671
4672    3. Set INV_P to true if the access pattern of the data reference in the
4673       vectorized loop is invariant.  Set it to false otherwise.
4674
4675    4. Return the pointer.  */
4676
4677 tree
4678 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4679                           tree offset, tree *initial_address,
4680                           gimple_stmt_iterator *gsi, gimple **ptr_incr,
4681                           bool only_init, bool *inv_p, tree byte_offset,
4682                           tree iv_step)
4683 {
4684   const char *base_name;
4685   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4686   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4687   struct loop *loop = NULL;
4688   bool nested_in_vect_loop = false;
4689   struct loop *containing_loop = NULL;
4690   tree aggr_ptr_type;
4691   tree aggr_ptr;
4692   tree new_temp;
4693   gimple_seq new_stmt_list = NULL;
4694   edge pe = NULL;
4695   basic_block new_bb;
4696   tree aggr_ptr_init;
4697   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4698   tree aptr;
4699   gimple_stmt_iterator incr_gsi;
4700   bool insert_after;
4701   tree indx_before_incr, indx_after_incr;
4702   gimple *incr;
4703   tree step;
4704   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4705
4706   gcc_assert (iv_step != NULL_TREE
4707               || TREE_CODE (aggr_type) == ARRAY_TYPE
4708               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4709
4710   if (loop_vinfo)
4711     {
4712       loop = LOOP_VINFO_LOOP (loop_vinfo);
4713       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4714       containing_loop = (gimple_bb (stmt))->loop_father;
4715       pe = loop_preheader_edge (loop);
4716     }
4717   else
4718     {
4719       gcc_assert (bb_vinfo);
4720       only_init = true;
4721       *ptr_incr = NULL;
4722     }
4723
4724   /* Check the step (evolution) of the load in LOOP, and record
4725      whether it's invariant.  */
4726   step = vect_dr_behavior (dr)->step;
4727   if (integer_zerop (step))
4728     *inv_p = true;
4729   else
4730     *inv_p = false;
4731
4732   /* Create an expression for the first address accessed by this load
4733      in LOOP.  */
4734   base_name = get_name (DR_BASE_ADDRESS (dr));
4735
4736   if (dump_enabled_p ())
4737     {
4738       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4739       dump_printf_loc (MSG_NOTE, vect_location,
4740                        "create %s-pointer variable to type: ",
4741                        get_tree_code_name (TREE_CODE (aggr_type)));
4742       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4743       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4744         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4745       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4746         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4747       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4748         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4749       else
4750         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4751       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4752       dump_printf (MSG_NOTE, "\n");
4753     }
4754
4755   /* (1) Create the new aggregate-pointer variable.
4756      Vector and array types inherit the alias set of their component
4757      type by default so we need to use a ref-all pointer if the data
4758      reference does not conflict with the created aggregated data
4759      reference because it is not addressable.  */
4760   bool need_ref_all = false;
4761   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4762                               get_alias_set (DR_REF (dr))))
4763     need_ref_all = true;
4764   /* Likewise for any of the data references in the stmt group.  */
4765   else if (DR_GROUP_SIZE (stmt_info) > 1)
4766     {
4767       gimple *orig_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info);
4768       do
4769         {
4770           stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4771           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4772           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4773                                       get_alias_set (DR_REF (sdr))))
4774             {
4775               need_ref_all = true;
4776               break;
4777             }
4778           orig_stmt = DR_GROUP_NEXT_ELEMENT (sinfo);
4779         }
4780       while (orig_stmt);
4781     }
4782   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4783                                                need_ref_all);
4784   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4785
4786
4787   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4788      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4789      def-use update cycles for the pointer: one relative to the outer-loop
4790      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4791      to the inner-loop (which is the inner-most loop containing the dataref),
4792      and this is done be step (5) below.
4793
4794      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4795      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4796      redundant.  Steps (3),(4) create the following:
4797
4798         vp0 = &base_addr;
4799         LOOP:   vp1 = phi(vp0,vp2)
4800                 ...
4801                 ...
4802                 vp2 = vp1 + step
4803                 goto LOOP
4804
4805      If there is an inner-loop nested in loop, then step (5) will also be
4806      applied, and an additional update in the inner-loop will be created:
4807
4808         vp0 = &base_addr;
4809         LOOP:   vp1 = phi(vp0,vp2)
4810                 ...
4811         inner:     vp3 = phi(vp1,vp4)
4812                    vp4 = vp3 + inner_step
4813                    if () goto inner
4814                 ...
4815                 vp2 = vp1 + step
4816                 if () goto LOOP   */
4817
4818   /* (2) Calculate the initial address of the aggregate-pointer, and set
4819      the aggregate-pointer to point to it before the loop.  */
4820
4821   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4822
4823   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4824                                                    offset, byte_offset);
4825   if (new_stmt_list)
4826     {
4827       if (pe)
4828         {
4829           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4830           gcc_assert (!new_bb);
4831         }
4832       else
4833         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4834     }
4835
4836   *initial_address = new_temp;
4837   aggr_ptr_init = new_temp;
4838
4839   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4840      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4841      inner-loop nested in LOOP (during outer-loop vectorization).  */
4842
4843   /* No update in loop is required.  */
4844   if (only_init && (!loop_vinfo || at_loop == loop))
4845     aptr = aggr_ptr_init;
4846   else
4847     {
4848       if (iv_step == NULL_TREE)
4849         {
4850           /* The step of the aggregate pointer is the type size.  */
4851           iv_step = TYPE_SIZE_UNIT (aggr_type);
4852           /* One exception to the above is when the scalar step of the load in
4853              LOOP is zero. In this case the step here is also zero.  */
4854           if (*inv_p)
4855             iv_step = size_zero_node;
4856           else if (tree_int_cst_sgn (step) == -1)
4857             iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4858         }
4859
4860       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4861
4862       create_iv (aggr_ptr_init,
4863                  fold_convert (aggr_ptr_type, iv_step),
4864                  aggr_ptr, loop, &incr_gsi, insert_after,
4865                  &indx_before_incr, &indx_after_incr);
4866       incr = gsi_stmt (incr_gsi);
4867       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4868
4869       /* Copy the points-to information if it exists. */
4870       if (DR_PTR_INFO (dr))
4871         {
4872           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4873           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4874         }
4875       if (ptr_incr)
4876         *ptr_incr = incr;
4877
4878       aptr = indx_before_incr;
4879     }
4880
4881   if (!nested_in_vect_loop || only_init)
4882     return aptr;
4883
4884
4885   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4886      nested in LOOP, if exists.  */
4887
4888   gcc_assert (nested_in_vect_loop);
4889   if (!only_init)
4890     {
4891       standard_iv_increment_position (containing_loop, &incr_gsi,
4892                                       &insert_after);
4893       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4894                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4895                  &indx_after_incr);
4896       incr = gsi_stmt (incr_gsi);
4897       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4898
4899       /* Copy the points-to information if it exists. */
4900       if (DR_PTR_INFO (dr))
4901         {
4902           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4903           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4904         }
4905       if (ptr_incr)
4906         *ptr_incr = incr;
4907
4908       return indx_before_incr;
4909     }
4910   else
4911     gcc_unreachable ();
4912 }
4913
4914
4915 /* Function bump_vector_ptr
4916
4917    Increment a pointer (to a vector type) by vector-size. If requested,
4918    i.e. if PTR-INCR is given, then also connect the new increment stmt
4919    to the existing def-use update-chain of the pointer, by modifying
4920    the PTR_INCR as illustrated below:
4921
4922    The pointer def-use update-chain before this function:
4923                         DATAREF_PTR = phi (p_0, p_2)
4924                         ....
4925         PTR_INCR:       p_2 = DATAREF_PTR + step
4926
4927    The pointer def-use update-chain after this function:
4928                         DATAREF_PTR = phi (p_0, p_2)
4929                         ....
4930                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4931                         ....
4932         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4933
4934    Input:
4935    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4936                  in the loop.
4937    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4938               the loop.  The increment amount across iterations is expected
4939               to be vector_size.
4940    BSI - location where the new update stmt is to be placed.
4941    STMT - the original scalar memory-access stmt that is being vectorized.
4942    BUMP - optional. The offset by which to bump the pointer. If not given,
4943           the offset is assumed to be vector_size.
4944
4945    Output: Return NEW_DATAREF_PTR as illustrated above.
4946
4947 */
4948
4949 tree
4950 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4951                  gimple *stmt, tree bump)
4952 {
4953   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4954   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4955   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4956   tree update = TYPE_SIZE_UNIT (vectype);
4957   gassign *incr_stmt;
4958   ssa_op_iter iter;
4959   use_operand_p use_p;
4960   tree new_dataref_ptr;
4961
4962   if (bump)
4963     update = bump;
4964
4965   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4966     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4967   else
4968     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4969   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4970                                    dataref_ptr, update);
4971   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4972
4973   /* Copy the points-to information if it exists. */
4974   if (DR_PTR_INFO (dr))
4975     {
4976       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4977       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4978     }
4979
4980   if (!ptr_incr)
4981     return new_dataref_ptr;
4982
4983   /* Update the vector-pointer's cross-iteration increment.  */
4984   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4985     {
4986       tree use = USE_FROM_PTR (use_p);
4987
4988       if (use == dataref_ptr)
4989         SET_USE (use_p, new_dataref_ptr);
4990       else
4991         gcc_assert (operand_equal_p (use, update, 0));
4992     }
4993
4994   return new_dataref_ptr;
4995 }
4996
4997
4998 /* Copy memory reference info such as base/clique from the SRC reference
4999    to the DEST MEM_REF.  */
5000
5001 void
5002 vect_copy_ref_info (tree dest, tree src)
5003 {
5004   if (TREE_CODE (dest) != MEM_REF)
5005     return;
5006
5007   tree src_base = src;
5008   while (handled_component_p (src_base))
5009     src_base = TREE_OPERAND (src_base, 0);
5010   if (TREE_CODE (src_base) != MEM_REF
5011       && TREE_CODE (src_base) != TARGET_MEM_REF)
5012     return;
5013
5014   MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5015   MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5016 }
5017
5018
5019 /* Function vect_create_destination_var.
5020
5021    Create a new temporary of type VECTYPE.  */
5022
5023 tree
5024 vect_create_destination_var (tree scalar_dest, tree vectype)
5025 {
5026   tree vec_dest;
5027   const char *name;
5028   char *new_name;
5029   tree type;
5030   enum vect_var_kind kind;
5031
5032   kind = vectype
5033     ? VECTOR_BOOLEAN_TYPE_P (vectype)
5034     ? vect_mask_var
5035     : vect_simple_var
5036     : vect_scalar_var;
5037   type = vectype ? vectype : TREE_TYPE (scalar_dest);
5038
5039   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5040
5041   name = get_name (scalar_dest);
5042   if (name)
5043     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5044   else
5045     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5046   vec_dest = vect_get_new_vect_var (type, kind, new_name);
5047   free (new_name);
5048
5049   return vec_dest;
5050 }
5051
5052 /* Function vect_grouped_store_supported.
5053
5054    Returns TRUE if interleave high and interleave low permutations
5055    are supported, and FALSE otherwise.  */
5056
5057 bool
5058 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5059 {
5060   machine_mode mode = TYPE_MODE (vectype);
5061
5062   /* vect_permute_store_chain requires the group size to be equal to 3 or
5063      be a power of two.  */
5064   if (count != 3 && exact_log2 (count) == -1)
5065     {
5066       if (dump_enabled_p ())
5067         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5068                          "the size of the group of accesses"
5069                          " is not a power of 2 or not eqaul to 3\n");
5070       return false;
5071     }
5072
5073   /* Check that the permutation is supported.  */
5074   if (VECTOR_MODE_P (mode))
5075     {
5076       unsigned int i;
5077       if (count == 3)
5078         {
5079           unsigned int j0 = 0, j1 = 0, j2 = 0;
5080           unsigned int i, j;
5081
5082           unsigned int nelt;
5083           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5084             {
5085               if (dump_enabled_p ())
5086                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5087                                  "cannot handle groups of 3 stores for"
5088                                  " variable-length vectors\n");
5089               return false;
5090             }
5091
5092           vec_perm_builder sel (nelt, nelt, 1);
5093           sel.quick_grow (nelt);
5094           vec_perm_indices indices;
5095           for (j = 0; j < 3; j++)
5096             {
5097               int nelt0 = ((3 - j) * nelt) % 3;
5098               int nelt1 = ((3 - j) * nelt + 1) % 3;
5099               int nelt2 = ((3 - j) * nelt + 2) % 3;
5100               for (i = 0; i < nelt; i++)
5101                 {
5102                   if (3 * i + nelt0 < nelt)
5103                     sel[3 * i + nelt0] = j0++;
5104                   if (3 * i + nelt1 < nelt)
5105                     sel[3 * i + nelt1] = nelt + j1++;
5106                   if (3 * i + nelt2 < nelt)
5107                     sel[3 * i + nelt2] = 0;
5108                 }
5109               indices.new_vector (sel, 2, nelt);
5110               if (!can_vec_perm_const_p (mode, indices))
5111                 {
5112                   if (dump_enabled_p ())
5113                     dump_printf (MSG_MISSED_OPTIMIZATION,
5114                                  "permutation op not supported by target.\n");
5115                   return false;
5116                 }
5117
5118               for (i = 0; i < nelt; i++)
5119                 {
5120                   if (3 * i + nelt0 < nelt)
5121                     sel[3 * i + nelt0] = 3 * i + nelt0;
5122                   if (3 * i + nelt1 < nelt)
5123                     sel[3 * i + nelt1] = 3 * i + nelt1;
5124                   if (3 * i + nelt2 < nelt)
5125                     sel[3 * i + nelt2] = nelt + j2++;
5126                 }
5127               indices.new_vector (sel, 2, nelt);
5128               if (!can_vec_perm_const_p (mode, indices))
5129                 {
5130                   if (dump_enabled_p ())
5131                     dump_printf (MSG_MISSED_OPTIMIZATION,
5132                                  "permutation op not supported by target.\n");
5133                   return false;
5134                 }
5135             }
5136           return true;
5137         }
5138       else
5139         {
5140           /* If length is not equal to 3 then only power of 2 is supported.  */
5141           gcc_assert (pow2p_hwi (count));
5142           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5143
5144           /* The encoding has 2 interleaved stepped patterns.  */
5145           vec_perm_builder sel (nelt, 2, 3);
5146           sel.quick_grow (6);
5147           for (i = 0; i < 3; i++)
5148             {
5149               sel[i * 2] = i;
5150               sel[i * 2 + 1] = i + nelt;
5151             }
5152           vec_perm_indices indices (sel, 2, nelt);
5153           if (can_vec_perm_const_p (mode, indices))
5154             {
5155               for (i = 0; i < 6; i++)
5156                 sel[i] += exact_div (nelt, 2);
5157               indices.new_vector (sel, 2, nelt);
5158               if (can_vec_perm_const_p (mode, indices))
5159                 return true;
5160             }
5161         }
5162     }
5163
5164   if (dump_enabled_p ())
5165     dump_printf (MSG_MISSED_OPTIMIZATION,
5166                  "permutaion op not supported by target.\n");
5167   return false;
5168 }
5169
5170
5171 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5172    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5173
5174 bool
5175 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5176                             bool masked_p)
5177 {
5178   if (masked_p)
5179     return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5180                                          vec_mask_store_lanes_optab,
5181                                          vectype, count);
5182   else
5183     return vect_lanes_optab_supported_p ("vec_store_lanes",
5184                                          vec_store_lanes_optab,
5185                                          vectype, count);
5186 }
5187
5188
5189 /* Function vect_permute_store_chain.
5190
5191    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5192    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5193    the data correctly for the stores.  Return the final references for stores
5194    in RESULT_CHAIN.
5195
5196    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5197    The input is 4 vectors each containing 8 elements.  We assign a number to
5198    each element, the input sequence is:
5199
5200    1st vec:   0  1  2  3  4  5  6  7
5201    2nd vec:   8  9 10 11 12 13 14 15
5202    3rd vec:  16 17 18 19 20 21 22 23
5203    4th vec:  24 25 26 27 28 29 30 31
5204
5205    The output sequence should be:
5206
5207    1st vec:  0  8 16 24  1  9 17 25
5208    2nd vec:  2 10 18 26  3 11 19 27
5209    3rd vec:  4 12 20 28  5 13 21 30
5210    4th vec:  6 14 22 30  7 15 23 31
5211
5212    i.e., we interleave the contents of the four vectors in their order.
5213
5214    We use interleave_high/low instructions to create such output.  The input of
5215    each interleave_high/low operation is two vectors:
5216    1st vec    2nd vec
5217    0 1 2 3    4 5 6 7
5218    the even elements of the result vector are obtained left-to-right from the
5219    high/low elements of the first vector.  The odd elements of the result are
5220    obtained left-to-right from the high/low elements of the second vector.
5221    The output of interleave_high will be:   0 4 1 5
5222    and of interleave_low:                   2 6 3 7
5223
5224
5225    The permutation is done in log LENGTH stages.  In each stage interleave_high
5226    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5227    where the first argument is taken from the first half of DR_CHAIN and the
5228    second argument from it's second half.
5229    In our example,
5230
5231    I1: interleave_high (1st vec, 3rd vec)
5232    I2: interleave_low (1st vec, 3rd vec)
5233    I3: interleave_high (2nd vec, 4th vec)
5234    I4: interleave_low (2nd vec, 4th vec)
5235
5236    The output for the first stage is:
5237
5238    I1:  0 16  1 17  2 18  3 19
5239    I2:  4 20  5 21  6 22  7 23
5240    I3:  8 24  9 25 10 26 11 27
5241    I4: 12 28 13 29 14 30 15 31
5242
5243    The output of the second stage, i.e. the final result is:
5244
5245    I1:  0  8 16 24  1  9 17 25
5246    I2:  2 10 18 26  3 11 19 27
5247    I3:  4 12 20 28  5 13 21 30
5248    I4:  6 14 22 30  7 15 23 31.  */
5249
5250 void
5251 vect_permute_store_chain (vec<tree> dr_chain,
5252                           unsigned int length,
5253                           gimple *stmt,
5254                           gimple_stmt_iterator *gsi,
5255                           vec<tree> *result_chain)
5256 {
5257   tree vect1, vect2, high, low;
5258   gimple *perm_stmt;
5259   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5260   tree perm_mask_low, perm_mask_high;
5261   tree data_ref;
5262   tree perm3_mask_low, perm3_mask_high;
5263   unsigned int i, j, n, log_length = exact_log2 (length);
5264
5265   result_chain->quick_grow (length);
5266   memcpy (result_chain->address (), dr_chain.address (),
5267           length * sizeof (tree));
5268
5269   if (length == 3)
5270     {
5271       /* vect_grouped_store_supported ensures that this is constant.  */
5272       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5273       unsigned int j0 = 0, j1 = 0, j2 = 0;
5274
5275       vec_perm_builder sel (nelt, nelt, 1);
5276       sel.quick_grow (nelt);
5277       vec_perm_indices indices;
5278       for (j = 0; j < 3; j++)
5279         {
5280           int nelt0 = ((3 - j) * nelt) % 3;
5281           int nelt1 = ((3 - j) * nelt + 1) % 3;
5282           int nelt2 = ((3 - j) * nelt + 2) % 3;
5283
5284           for (i = 0; i < nelt; i++)
5285             {
5286               if (3 * i + nelt0 < nelt)
5287                 sel[3 * i + nelt0] = j0++;
5288               if (3 * i + nelt1 < nelt)
5289                 sel[3 * i + nelt1] = nelt + j1++;
5290               if (3 * i + nelt2 < nelt)
5291                 sel[3 * i + nelt2] = 0;
5292             }
5293           indices.new_vector (sel, 2, nelt);
5294           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5295
5296           for (i = 0; i < nelt; i++)
5297             {
5298               if (3 * i + nelt0 < nelt)
5299                 sel[3 * i + nelt0] = 3 * i + nelt0;
5300               if (3 * i + nelt1 < nelt)
5301                 sel[3 * i + nelt1] = 3 * i + nelt1;
5302               if (3 * i + nelt2 < nelt)
5303                 sel[3 * i + nelt2] = nelt + j2++;
5304             }
5305           indices.new_vector (sel, 2, nelt);
5306           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5307
5308           vect1 = dr_chain[0];
5309           vect2 = dr_chain[1];
5310
5311           /* Create interleaving stmt:
5312              low = VEC_PERM_EXPR <vect1, vect2,
5313                                   {j, nelt, *, j + 1, nelt + j + 1, *,
5314                                    j + 2, nelt + j + 2, *, ...}>  */
5315           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5316           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5317                                            vect2, perm3_mask_low);
5318           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5319
5320           vect1 = data_ref;
5321           vect2 = dr_chain[2];
5322           /* Create interleaving stmt:
5323              low = VEC_PERM_EXPR <vect1, vect2,
5324                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
5325                                    6, 7, nelt + j + 2, ...}>  */
5326           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5327           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5328                                            vect2, perm3_mask_high);
5329           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5330           (*result_chain)[j] = data_ref;
5331         }
5332     }
5333   else
5334     {
5335       /* If length is not equal to 3 then only power of 2 is supported.  */
5336       gcc_assert (pow2p_hwi (length));
5337
5338       /* The encoding has 2 interleaved stepped patterns.  */
5339       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5340       vec_perm_builder sel (nelt, 2, 3);
5341       sel.quick_grow (6);
5342       for (i = 0; i < 3; i++)
5343         {
5344           sel[i * 2] = i;
5345           sel[i * 2 + 1] = i + nelt;
5346         }
5347         vec_perm_indices indices (sel, 2, nelt);
5348         perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5349
5350         for (i = 0; i < 6; i++)
5351           sel[i] += exact_div (nelt, 2);
5352         indices.new_vector (sel, 2, nelt);
5353         perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5354
5355         for (i = 0, n = log_length; i < n; i++)
5356           {
5357             for (j = 0; j < length/2; j++)
5358               {
5359                 vect1 = dr_chain[j];
5360                 vect2 = dr_chain[j+length/2];
5361
5362                 /* Create interleaving stmt:
5363                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5364                                                         ...}>  */
5365                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5366                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5367                                                  vect2, perm_mask_high);
5368                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5369                 (*result_chain)[2*j] = high;
5370
5371                 /* Create interleaving stmt:
5372                    low = VEC_PERM_EXPR <vect1, vect2,
5373                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5374                                          ...}>  */
5375                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5376                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5377                                                  vect2, perm_mask_low);
5378                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5379                 (*result_chain)[2*j+1] = low;
5380               }
5381             memcpy (dr_chain.address (), result_chain->address (),
5382                     length * sizeof (tree));
5383           }
5384     }
5385 }
5386
5387 /* Function vect_setup_realignment
5388
5389    This function is called when vectorizing an unaligned load using
5390    the dr_explicit_realign[_optimized] scheme.
5391    This function generates the following code at the loop prolog:
5392
5393       p = initial_addr;
5394    x  msq_init = *(floor(p));   # prolog load
5395       realignment_token = call target_builtin;
5396     loop:
5397    x  msq = phi (msq_init, ---)
5398
5399    The stmts marked with x are generated only for the case of
5400    dr_explicit_realign_optimized.
5401
5402    The code above sets up a new (vector) pointer, pointing to the first
5403    location accessed by STMT, and a "floor-aligned" load using that pointer.
5404    It also generates code to compute the "realignment-token" (if the relevant
5405    target hook was defined), and creates a phi-node at the loop-header bb
5406    whose arguments are the result of the prolog-load (created by this
5407    function) and the result of a load that takes place in the loop (to be
5408    created by the caller to this function).
5409
5410    For the case of dr_explicit_realign_optimized:
5411    The caller to this function uses the phi-result (msq) to create the
5412    realignment code inside the loop, and sets up the missing phi argument,
5413    as follows:
5414     loop:
5415       msq = phi (msq_init, lsq)
5416       lsq = *(floor(p'));        # load in loop
5417       result = realign_load (msq, lsq, realignment_token);
5418
5419    For the case of dr_explicit_realign:
5420     loop:
5421       msq = *(floor(p));        # load in loop
5422       p' = p + (VS-1);
5423       lsq = *(floor(p'));       # load in loop
5424       result = realign_load (msq, lsq, realignment_token);
5425
5426    Input:
5427    STMT - (scalar) load stmt to be vectorized. This load accesses
5428           a memory location that may be unaligned.
5429    BSI - place where new code is to be inserted.
5430    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5431                               is used.
5432
5433    Output:
5434    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5435                        target hook, if defined.
5436    Return value - the result of the loop-header phi node.  */
5437
5438 tree
5439 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5440                         tree *realignment_token,
5441                         enum dr_alignment_support alignment_support_scheme,
5442                         tree init_addr,
5443                         struct loop **at_loop)
5444 {
5445   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5446   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5447   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5448   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5449   struct loop *loop = NULL;
5450   edge pe = NULL;
5451   tree scalar_dest = gimple_assign_lhs (stmt);
5452   tree vec_dest;
5453   gimple *inc;
5454   tree ptr;
5455   tree data_ref;
5456   basic_block new_bb;
5457   tree msq_init = NULL_TREE;
5458   tree new_temp;
5459   gphi *phi_stmt;
5460   tree msq = NULL_TREE;
5461   gimple_seq stmts = NULL;
5462   bool inv_p;
5463   bool compute_in_loop = false;
5464   bool nested_in_vect_loop = false;
5465   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5466   struct loop *loop_for_initial_load = NULL;
5467
5468   if (loop_vinfo)
5469     {
5470       loop = LOOP_VINFO_LOOP (loop_vinfo);
5471       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5472     }
5473
5474   gcc_assert (alignment_support_scheme == dr_explicit_realign
5475               || alignment_support_scheme == dr_explicit_realign_optimized);
5476
5477   /* We need to generate three things:
5478      1. the misalignment computation
5479      2. the extra vector load (for the optimized realignment scheme).
5480      3. the phi node for the two vectors from which the realignment is
5481       done (for the optimized realignment scheme).  */
5482
5483   /* 1. Determine where to generate the misalignment computation.
5484
5485      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5486      calculation will be generated by this function, outside the loop (in the
5487      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5488      caller, inside the loop.
5489
5490      Background: If the misalignment remains fixed throughout the iterations of
5491      the loop, then both realignment schemes are applicable, and also the
5492      misalignment computation can be done outside LOOP.  This is because we are
5493      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5494      are a multiple of VS (the Vector Size), and therefore the misalignment in
5495      different vectorized LOOP iterations is always the same.
5496      The problem arises only if the memory access is in an inner-loop nested
5497      inside LOOP, which is now being vectorized using outer-loop vectorization.
5498      This is the only case when the misalignment of the memory access may not
5499      remain fixed throughout the iterations of the inner-loop (as explained in
5500      detail in vect_supportable_dr_alignment).  In this case, not only is the
5501      optimized realignment scheme not applicable, but also the misalignment
5502      computation (and generation of the realignment token that is passed to
5503      REALIGN_LOAD) have to be done inside the loop.
5504
5505      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5506      or not, which in turn determines if the misalignment is computed inside
5507      the inner-loop, or outside LOOP.  */
5508
5509   if (init_addr != NULL_TREE || !loop_vinfo)
5510     {
5511       compute_in_loop = true;
5512       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5513     }
5514
5515
5516   /* 2. Determine where to generate the extra vector load.
5517
5518      For the optimized realignment scheme, instead of generating two vector
5519      loads in each iteration, we generate a single extra vector load in the
5520      preheader of the loop, and in each iteration reuse the result of the
5521      vector load from the previous iteration.  In case the memory access is in
5522      an inner-loop nested inside LOOP, which is now being vectorized using
5523      outer-loop vectorization, we need to determine whether this initial vector
5524      load should be generated at the preheader of the inner-loop, or can be
5525      generated at the preheader of LOOP.  If the memory access has no evolution
5526      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5527      to be generated inside LOOP (in the preheader of the inner-loop).  */
5528
5529   if (nested_in_vect_loop)
5530     {
5531       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5532       bool invariant_in_outerloop =
5533             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5534       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5535     }
5536   else
5537     loop_for_initial_load = loop;
5538   if (at_loop)
5539     *at_loop = loop_for_initial_load;
5540
5541   if (loop_for_initial_load)
5542     pe = loop_preheader_edge (loop_for_initial_load);
5543
5544   /* 3. For the case of the optimized realignment, create the first vector
5545       load at the loop preheader.  */
5546
5547   if (alignment_support_scheme == dr_explicit_realign_optimized)
5548     {
5549       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5550       gassign *new_stmt;
5551
5552       gcc_assert (!compute_in_loop);
5553       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5554       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5555                                       NULL_TREE, &init_addr, NULL, &inc,
5556                                       true, &inv_p);
5557       if (TREE_CODE (ptr) == SSA_NAME)
5558         new_temp = copy_ssa_name (ptr);
5559       else
5560         new_temp = make_ssa_name (TREE_TYPE (ptr));
5561       unsigned int align = DR_TARGET_ALIGNMENT (dr);
5562       new_stmt = gimple_build_assign
5563                    (new_temp, BIT_AND_EXPR, ptr,
5564                     build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5565       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5566       gcc_assert (!new_bb);
5567       data_ref
5568         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5569                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5570       vect_copy_ref_info (data_ref, DR_REF (dr));
5571       new_stmt = gimple_build_assign (vec_dest, data_ref);
5572       new_temp = make_ssa_name (vec_dest, new_stmt);
5573       gimple_assign_set_lhs (new_stmt, new_temp);
5574       if (pe)
5575         {
5576           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5577           gcc_assert (!new_bb);
5578         }
5579       else
5580          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5581
5582       msq_init = gimple_assign_lhs (new_stmt);
5583     }
5584
5585   /* 4. Create realignment token using a target builtin, if available.
5586       It is done either inside the containing loop, or before LOOP (as
5587       determined above).  */
5588
5589   if (targetm.vectorize.builtin_mask_for_load)
5590     {
5591       gcall *new_stmt;
5592       tree builtin_decl;
5593
5594       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
5595       if (!init_addr)
5596         {
5597           /* Generate the INIT_ADDR computation outside LOOP.  */
5598           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5599                                                             NULL_TREE);
5600           if (loop)
5601             {
5602               pe = loop_preheader_edge (loop);
5603               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5604               gcc_assert (!new_bb);
5605             }
5606           else
5607              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5608         }
5609
5610       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5611       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5612       vec_dest =
5613         vect_create_destination_var (scalar_dest,
5614                                      gimple_call_return_type (new_stmt));
5615       new_temp = make_ssa_name (vec_dest, new_stmt);
5616       gimple_call_set_lhs (new_stmt, new_temp);
5617
5618       if (compute_in_loop)
5619         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5620       else
5621         {
5622           /* Generate the misalignment computation outside LOOP.  */
5623           pe = loop_preheader_edge (loop);
5624           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5625           gcc_assert (!new_bb);
5626         }
5627
5628       *realignment_token = gimple_call_lhs (new_stmt);
5629
5630       /* The result of the CALL_EXPR to this builtin is determined from
5631          the value of the parameter and no global variables are touched
5632          which makes the builtin a "const" function.  Requiring the
5633          builtin to have the "const" attribute makes it unnecessary
5634          to call mark_call_clobbered.  */
5635       gcc_assert (TREE_READONLY (builtin_decl));
5636     }
5637
5638   if (alignment_support_scheme == dr_explicit_realign)
5639     return msq;
5640
5641   gcc_assert (!compute_in_loop);
5642   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5643
5644
5645   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5646
5647   pe = loop_preheader_edge (containing_loop);
5648   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5649   msq = make_ssa_name (vec_dest);
5650   phi_stmt = create_phi_node (msq, containing_loop->header);
5651   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5652
5653   return msq;
5654 }
5655
5656
5657 /* Function vect_grouped_load_supported.
5658
5659    COUNT is the size of the load group (the number of statements plus the
5660    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5661    only one statement, with a gap of COUNT - 1.
5662
5663    Returns true if a suitable permute exists.  */
5664
5665 bool
5666 vect_grouped_load_supported (tree vectype, bool single_element_p,
5667                              unsigned HOST_WIDE_INT count)
5668 {
5669   machine_mode mode = TYPE_MODE (vectype);
5670
5671   /* If this is single-element interleaving with an element distance
5672      that leaves unused vector loads around punt - we at least create
5673      very sub-optimal code in that case (and blow up memory,
5674      see PR65518).  */
5675   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5676     {
5677       if (dump_enabled_p ())
5678         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5679                          "single-element interleaving not supported "
5680                          "for not adjacent vector loads\n");
5681       return false;
5682     }
5683
5684   /* vect_permute_load_chain requires the group size to be equal to 3 or
5685      be a power of two.  */
5686   if (count != 3 && exact_log2 (count) == -1)
5687     {
5688       if (dump_enabled_p ())
5689         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5690                          "the size of the group of accesses"
5691                          " is not a power of 2 or not equal to 3\n");
5692       return false;
5693     }
5694
5695   /* Check that the permutation is supported.  */
5696   if (VECTOR_MODE_P (mode))
5697     {
5698       unsigned int i, j;
5699       if (count == 3)
5700         {
5701           unsigned int nelt;
5702           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5703             {
5704               if (dump_enabled_p ())
5705                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5706                                  "cannot handle groups of 3 loads for"
5707                                  " variable-length vectors\n");
5708               return false;
5709             }
5710
5711           vec_perm_builder sel (nelt, nelt, 1);
5712           sel.quick_grow (nelt);
5713           vec_perm_indices indices;
5714           unsigned int k;
5715           for (k = 0; k < 3; k++)
5716             {
5717               for (i = 0; i < nelt; i++)
5718                 if (3 * i + k < 2 * nelt)
5719                   sel[i] = 3 * i + k;
5720                 else
5721                   sel[i] = 0;
5722               indices.new_vector (sel, 2, nelt);
5723               if (!can_vec_perm_const_p (mode, indices))
5724                 {
5725                   if (dump_enabled_p ())
5726                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5727                                      "shuffle of 3 loads is not supported by"
5728                                      " target\n");
5729                   return false;
5730                 }
5731               for (i = 0, j = 0; i < nelt; i++)
5732                 if (3 * i + k < 2 * nelt)
5733                   sel[i] = i;
5734                 else
5735                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5736               indices.new_vector (sel, 2, nelt);
5737               if (!can_vec_perm_const_p (mode, indices))
5738                 {
5739                   if (dump_enabled_p ())
5740                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5741                                      "shuffle of 3 loads is not supported by"
5742                                      " target\n");
5743                   return false;
5744                 }
5745             }
5746           return true;
5747         }
5748       else
5749         {
5750           /* If length is not equal to 3 then only power of 2 is supported.  */
5751           gcc_assert (pow2p_hwi (count));
5752           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5753
5754           /* The encoding has a single stepped pattern.  */
5755           vec_perm_builder sel (nelt, 1, 3);
5756           sel.quick_grow (3);
5757           for (i = 0; i < 3; i++)
5758             sel[i] = i * 2;
5759           vec_perm_indices indices (sel, 2, nelt);
5760           if (can_vec_perm_const_p (mode, indices))
5761             {
5762               for (i = 0; i < 3; i++)
5763                 sel[i] = i * 2 + 1;
5764               indices.new_vector (sel, 2, nelt);
5765               if (can_vec_perm_const_p (mode, indices))
5766                 return true;
5767             }
5768         }
5769     }
5770
5771   if (dump_enabled_p ())
5772     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5773                      "extract even/odd not supported by target\n");
5774   return false;
5775 }
5776
5777 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5778    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5779
5780 bool
5781 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5782                            bool masked_p)
5783 {
5784   if (masked_p)
5785     return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5786                                          vec_mask_load_lanes_optab,
5787                                          vectype, count);
5788   else
5789     return vect_lanes_optab_supported_p ("vec_load_lanes",
5790                                          vec_load_lanes_optab,
5791                                          vectype, count);
5792 }
5793
5794 /* Function vect_permute_load_chain.
5795
5796    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5797    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5798    the input data correctly.  Return the final references for loads in
5799    RESULT_CHAIN.
5800
5801    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5802    The input is 4 vectors each containing 8 elements. We assign a number to each
5803    element, the input sequence is:
5804
5805    1st vec:   0  1  2  3  4  5  6  7
5806    2nd vec:   8  9 10 11 12 13 14 15
5807    3rd vec:  16 17 18 19 20 21 22 23
5808    4th vec:  24 25 26 27 28 29 30 31
5809
5810    The output sequence should be:
5811
5812    1st vec:  0 4  8 12 16 20 24 28
5813    2nd vec:  1 5  9 13 17 21 25 29
5814    3rd vec:  2 6 10 14 18 22 26 30
5815    4th vec:  3 7 11 15 19 23 27 31
5816
5817    i.e., the first output vector should contain the first elements of each
5818    interleaving group, etc.
5819
5820    We use extract_even/odd instructions to create such output.  The input of
5821    each extract_even/odd operation is two vectors
5822    1st vec    2nd vec
5823    0 1 2 3    4 5 6 7
5824
5825    and the output is the vector of extracted even/odd elements.  The output of
5826    extract_even will be:   0 2 4 6
5827    and of extract_odd:     1 3 5 7
5828
5829
5830    The permutation is done in log LENGTH stages.  In each stage extract_even
5831    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5832    their order.  In our example,
5833
5834    E1: extract_even (1st vec, 2nd vec)
5835    E2: extract_odd (1st vec, 2nd vec)
5836    E3: extract_even (3rd vec, 4th vec)
5837    E4: extract_odd (3rd vec, 4th vec)
5838
5839    The output for the first stage will be:
5840
5841    E1:  0  2  4  6  8 10 12 14
5842    E2:  1  3  5  7  9 11 13 15
5843    E3: 16 18 20 22 24 26 28 30
5844    E4: 17 19 21 23 25 27 29 31
5845
5846    In order to proceed and create the correct sequence for the next stage (or
5847    for the correct output, if the second stage is the last one, as in our
5848    example), we first put the output of extract_even operation and then the
5849    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5850    The input for the second stage is:
5851
5852    1st vec (E1):  0  2  4  6  8 10 12 14
5853    2nd vec (E3): 16 18 20 22 24 26 28 30
5854    3rd vec (E2):  1  3  5  7  9 11 13 15
5855    4th vec (E4): 17 19 21 23 25 27 29 31
5856
5857    The output of the second stage:
5858
5859    E1: 0 4  8 12 16 20 24 28
5860    E2: 2 6 10 14 18 22 26 30
5861    E3: 1 5  9 13 17 21 25 29
5862    E4: 3 7 11 15 19 23 27 31
5863
5864    And RESULT_CHAIN after reordering:
5865
5866    1st vec (E1):  0 4  8 12 16 20 24 28
5867    2nd vec (E3):  1 5  9 13 17 21 25 29
5868    3rd vec (E2):  2 6 10 14 18 22 26 30
5869    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5870
5871 static void
5872 vect_permute_load_chain (vec<tree> dr_chain,
5873                          unsigned int length,
5874                          gimple *stmt,
5875                          gimple_stmt_iterator *gsi,
5876                          vec<tree> *result_chain)
5877 {
5878   tree data_ref, first_vect, second_vect;
5879   tree perm_mask_even, perm_mask_odd;
5880   tree perm3_mask_low, perm3_mask_high;
5881   gimple *perm_stmt;
5882   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5883   unsigned int i, j, log_length = exact_log2 (length);
5884
5885   result_chain->quick_grow (length);
5886   memcpy (result_chain->address (), dr_chain.address (),
5887           length * sizeof (tree));
5888
5889   if (length == 3)
5890     {
5891       /* vect_grouped_load_supported ensures that this is constant.  */
5892       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5893       unsigned int k;
5894
5895       vec_perm_builder sel (nelt, nelt, 1);
5896       sel.quick_grow (nelt);
5897       vec_perm_indices indices;
5898       for (k = 0; k < 3; k++)
5899         {
5900           for (i = 0; i < nelt; i++)
5901             if (3 * i + k < 2 * nelt)
5902               sel[i] = 3 * i + k;
5903             else
5904               sel[i] = 0;
5905           indices.new_vector (sel, 2, nelt);
5906           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5907
5908           for (i = 0, j = 0; i < nelt; i++)
5909             if (3 * i + k < 2 * nelt)
5910               sel[i] = i;
5911             else
5912               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5913           indices.new_vector (sel, 2, nelt);
5914           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5915
5916           first_vect = dr_chain[0];
5917           second_vect = dr_chain[1];
5918
5919           /* Create interleaving stmt (low part of):
5920              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5921                                                              ...}>  */
5922           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5923           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5924                                            second_vect, perm3_mask_low);
5925           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5926
5927           /* Create interleaving stmt (high part of):
5928              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5929                                                               ...}>  */
5930           first_vect = data_ref;
5931           second_vect = dr_chain[2];
5932           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5933           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5934                                            second_vect, perm3_mask_high);
5935           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5936           (*result_chain)[k] = data_ref;
5937         }
5938     }
5939   else
5940     {
5941       /* If length is not equal to 3 then only power of 2 is supported.  */
5942       gcc_assert (pow2p_hwi (length));
5943
5944       /* The encoding has a single stepped pattern.  */
5945       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5946       vec_perm_builder sel (nelt, 1, 3);
5947       sel.quick_grow (3);
5948       for (i = 0; i < 3; ++i)
5949         sel[i] = i * 2;
5950       vec_perm_indices indices (sel, 2, nelt);
5951       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5952
5953       for (i = 0; i < 3; ++i)
5954         sel[i] = i * 2 + 1;
5955       indices.new_vector (sel, 2, nelt);
5956       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5957
5958       for (i = 0; i < log_length; i++)
5959         {
5960           for (j = 0; j < length; j += 2)
5961             {
5962               first_vect = dr_chain[j];
5963               second_vect = dr_chain[j+1];
5964
5965               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5966               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5967               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5968                                                first_vect, second_vect,
5969                                                perm_mask_even);
5970               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5971               (*result_chain)[j/2] = data_ref;
5972
5973               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5974               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5975               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5976                                                first_vect, second_vect,
5977                                                perm_mask_odd);
5978               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5979               (*result_chain)[j/2+length/2] = data_ref;
5980             }
5981           memcpy (dr_chain.address (), result_chain->address (),
5982                   length * sizeof (tree));
5983         }
5984     }
5985 }
5986
5987 /* Function vect_shift_permute_load_chain.
5988
5989    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5990    sequence of stmts to reorder the input data accordingly.
5991    Return the final references for loads in RESULT_CHAIN.
5992    Return true if successed, false otherwise.
5993
5994    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5995    The input is 3 vectors each containing 8 elements.  We assign a
5996    number to each element, the input sequence is:
5997
5998    1st vec:   0  1  2  3  4  5  6  7
5999    2nd vec:   8  9 10 11 12 13 14 15
6000    3rd vec:  16 17 18 19 20 21 22 23
6001
6002    The output sequence should be:
6003
6004    1st vec:  0 3 6  9 12 15 18 21
6005    2nd vec:  1 4 7 10 13 16 19 22
6006    3rd vec:  2 5 8 11 14 17 20 23
6007
6008    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6009
6010    First we shuffle all 3 vectors to get correct elements order:
6011
6012    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
6013    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
6014    3rd vec:  (16 19 22) (17 20 23) (18 21)
6015
6016    Next we unite and shift vector 3 times:
6017
6018    1st step:
6019      shift right by 6 the concatenation of:
6020      "1st vec" and  "2nd vec"
6021        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6022      "2nd vec" and  "3rd vec"
6023        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6024      "3rd vec" and  "1st vec"
6025        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6026                              | New vectors                   |
6027
6028      So that now new vectors are:
6029
6030      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6031      2nd vec:  (10 13) (16 19 22) (17 20 23)
6032      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6033
6034    2nd step:
6035      shift right by 5 the concatenation of:
6036      "1st vec" and  "3rd vec"
6037        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6038      "2nd vec" and  "1st vec"
6039        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6040      "3rd vec" and  "2nd vec"
6041        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6042                           | New vectors                   |
6043
6044      So that now new vectors are:
6045
6046      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6047      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6048      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6049
6050    3rd step:
6051      shift right by 5 the concatenation of:
6052      "1st vec" and  "1st vec"
6053        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6054      shift right by 3 the concatenation of:
6055      "2nd vec" and  "2nd vec"
6056                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6057                           | New vectors                   |
6058
6059      So that now all vectors are READY:
6060      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6061      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6062      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6063
6064    This algorithm is faster than one in vect_permute_load_chain if:
6065      1.  "shift of a concatination" is faster than general permutation.
6066          This is usually so.
6067      2.  The TARGET machine can't execute vector instructions in parallel.
6068          This is because each step of the algorithm depends on previous.
6069          The algorithm in vect_permute_load_chain is much more parallel.
6070
6071    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6072 */
6073
6074 static bool
6075 vect_shift_permute_load_chain (vec<tree> dr_chain,
6076                                unsigned int length,
6077                                gimple *stmt,
6078                                gimple_stmt_iterator *gsi,
6079                                vec<tree> *result_chain)
6080 {
6081   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6082   tree perm2_mask1, perm2_mask2, perm3_mask;
6083   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6084   gimple *perm_stmt;
6085
6086   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6087   unsigned int i;
6088   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6089   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6090
6091   unsigned HOST_WIDE_INT nelt, vf;
6092   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6093       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6094     /* Not supported for variable-length vectors.  */
6095     return false;
6096
6097   vec_perm_builder sel (nelt, nelt, 1);
6098   sel.quick_grow (nelt);
6099
6100   result_chain->quick_grow (length);
6101   memcpy (result_chain->address (), dr_chain.address (),
6102           length * sizeof (tree));
6103
6104   if (pow2p_hwi (length) && vf > 4)
6105     {
6106       unsigned int j, log_length = exact_log2 (length);
6107       for (i = 0; i < nelt / 2; ++i)
6108         sel[i] = i * 2;
6109       for (i = 0; i < nelt / 2; ++i)
6110         sel[nelt / 2 + i] = i * 2 + 1;
6111       vec_perm_indices indices (sel, 2, nelt);
6112       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6113         {
6114           if (dump_enabled_p ())
6115             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6116                              "shuffle of 2 fields structure is not \
6117                               supported by target\n");
6118           return false;
6119         }
6120       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6121
6122       for (i = 0; i < nelt / 2; ++i)
6123         sel[i] = i * 2 + 1;
6124       for (i = 0; i < nelt / 2; ++i)
6125         sel[nelt / 2 + i] = i * 2;
6126       indices.new_vector (sel, 2, nelt);
6127       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6128         {
6129           if (dump_enabled_p ())
6130             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6131                              "shuffle of 2 fields structure is not \
6132                               supported by target\n");
6133           return false;
6134         }
6135       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6136
6137       /* Generating permutation constant to shift all elements.
6138          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6139       for (i = 0; i < nelt; i++)
6140         sel[i] = nelt / 2 + i;
6141       indices.new_vector (sel, 2, nelt);
6142       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6143         {
6144           if (dump_enabled_p ())
6145             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6146                              "shift permutation is not supported by target\n");
6147           return false;
6148         }
6149       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6150
6151       /* Generating permutation constant to select vector from 2.
6152          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6153       for (i = 0; i < nelt / 2; i++)
6154         sel[i] = i;
6155       for (i = nelt / 2; i < nelt; i++)
6156         sel[i] = nelt + i;
6157       indices.new_vector (sel, 2, nelt);
6158       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6159         {
6160           if (dump_enabled_p ())
6161             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6162                              "select is not supported by target\n");
6163           return false;
6164         }
6165       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6166
6167       for (i = 0; i < log_length; i++)
6168         {
6169           for (j = 0; j < length; j += 2)
6170             {
6171               first_vect = dr_chain[j];
6172               second_vect = dr_chain[j + 1];
6173
6174               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6175               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6176                                                first_vect, first_vect,
6177                                                perm2_mask1);
6178               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6179               vect[0] = data_ref;
6180
6181               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6182               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6183                                                second_vect, second_vect,
6184                                                perm2_mask2);
6185               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6186               vect[1] = data_ref;
6187
6188               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6189               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6190                                                vect[0], vect[1], shift1_mask);
6191               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6192               (*result_chain)[j/2 + length/2] = data_ref;
6193
6194               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6195               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6196                                                vect[0], vect[1], select_mask);
6197               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6198               (*result_chain)[j/2] = data_ref;
6199             }
6200           memcpy (dr_chain.address (), result_chain->address (),
6201                   length * sizeof (tree));
6202         }
6203       return true;
6204     }
6205   if (length == 3 && vf > 2)
6206     {
6207       unsigned int k = 0, l = 0;
6208
6209       /* Generating permutation constant to get all elements in rigth order.
6210          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6211       for (i = 0; i < nelt; i++)
6212         {
6213           if (3 * k + (l % 3) >= nelt)
6214             {
6215               k = 0;
6216               l += (3 - (nelt % 3));
6217             }
6218           sel[i] = 3 * k + (l % 3);
6219           k++;
6220         }
6221       vec_perm_indices indices (sel, 2, nelt);
6222       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6223         {
6224           if (dump_enabled_p ())
6225             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6226                              "shuffle of 3 fields structure is not \
6227                               supported by target\n");
6228           return false;
6229         }
6230       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6231
6232       /* Generating permutation constant to shift all elements.
6233          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6234       for (i = 0; i < nelt; i++)
6235         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6236       indices.new_vector (sel, 2, nelt);
6237       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6238         {
6239           if (dump_enabled_p ())
6240             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6241                              "shift permutation is not supported by target\n");
6242           return false;
6243         }
6244       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6245
6246       /* Generating permutation constant to shift all elements.
6247          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6248       for (i = 0; i < nelt; i++)
6249         sel[i] = 2 * (nelt / 3) + 1 + i;
6250       indices.new_vector (sel, 2, nelt);
6251       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6252         {
6253           if (dump_enabled_p ())
6254             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6255                              "shift permutation is not supported by target\n");
6256           return false;
6257         }
6258       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6259
6260       /* Generating permutation constant to shift all elements.
6261          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6262       for (i = 0; i < nelt; i++)
6263         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6264       indices.new_vector (sel, 2, nelt);
6265       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6266         {
6267           if (dump_enabled_p ())
6268             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6269                              "shift permutation is not supported by target\n");
6270           return false;
6271         }
6272       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6273
6274       /* Generating permutation constant to shift all elements.
6275          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6276       for (i = 0; i < nelt; i++)
6277         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6278       indices.new_vector (sel, 2, nelt);
6279       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6280         {
6281           if (dump_enabled_p ())
6282             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6283                              "shift permutation is not supported by target\n");
6284           return false;
6285         }
6286       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6287
6288       for (k = 0; k < 3; k++)
6289         {
6290           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6291           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6292                                            dr_chain[k], dr_chain[k],
6293                                            perm3_mask);
6294           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6295           vect[k] = data_ref;
6296         }
6297
6298       for (k = 0; k < 3; k++)
6299         {
6300           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6301           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6302                                            vect[k % 3], vect[(k + 1) % 3],
6303                                            shift1_mask);
6304           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6305           vect_shift[k] = data_ref;
6306         }
6307
6308       for (k = 0; k < 3; k++)
6309         {
6310           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6311           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6312                                            vect_shift[(4 - k) % 3],
6313                                            vect_shift[(3 - k) % 3],
6314                                            shift2_mask);
6315           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6316           vect[k] = data_ref;
6317         }
6318
6319       (*result_chain)[3 - (nelt % 3)] = vect[2];
6320
6321       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6322       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6323                                        vect[0], shift3_mask);
6324       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6325       (*result_chain)[nelt % 3] = data_ref;
6326
6327       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6328       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6329                                        vect[1], shift4_mask);
6330       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6331       (*result_chain)[0] = data_ref;
6332       return true;
6333     }
6334   return false;
6335 }
6336
6337 /* Function vect_transform_grouped_load.
6338
6339    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6340    to perform their permutation and ascribe the result vectorized statements to
6341    the scalar statements.
6342 */
6343
6344 void
6345 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6346                              gimple_stmt_iterator *gsi)
6347 {
6348   machine_mode mode;
6349   vec<tree> result_chain = vNULL;
6350
6351   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6352      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6353      vectors, that are ready for vector computation.  */
6354   result_chain.create (size);
6355
6356   /* If reassociation width for vector type is 2 or greater target machine can
6357      execute 2 or more vector instructions in parallel.  Otherwise try to
6358      get chain for loads group using vect_shift_permute_load_chain.  */
6359   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6360   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6361       || pow2p_hwi (size)
6362       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6363                                          gsi, &result_chain))
6364     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6365   vect_record_grouped_load_vectors (stmt, result_chain);
6366   result_chain.release ();
6367 }
6368
6369 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6370    generated as part of the vectorization of STMT.  Assign the statement
6371    for each vector to the associated scalar statement.  */
6372
6373 void
6374 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6375 {
6376   gimple *first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6377   gimple *next_stmt, *new_stmt;
6378   unsigned int i, gap_count;
6379   tree tmp_data_ref;
6380
6381   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6382      Since we scan the chain starting from it's first node, their order
6383      corresponds the order of data-refs in RESULT_CHAIN.  */
6384   next_stmt = first_stmt;
6385   gap_count = 1;
6386   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6387     {
6388       if (!next_stmt)
6389         break;
6390
6391       /* Skip the gaps.  Loads created for the gaps will be removed by dead
6392        code elimination pass later.  No need to check for the first stmt in
6393        the group, since it always exists.
6394        DR_GROUP_GAP is the number of steps in elements from the previous
6395        access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
6396        correspond to the gaps.  */
6397       if (next_stmt != first_stmt
6398           && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
6399       {
6400         gap_count++;
6401         continue;
6402       }
6403
6404       while (next_stmt)
6405         {
6406           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6407           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6408              copies, and we put the new vector statement in the first available
6409              RELATED_STMT.  */
6410           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6411             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6412           else
6413             {
6414               if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6415                 {
6416                   gimple *prev_stmt =
6417                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6418                   gimple *rel_stmt =
6419                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6420                   while (rel_stmt)
6421                     {
6422                       prev_stmt = rel_stmt;
6423                       rel_stmt =
6424                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6425                     }
6426
6427                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6428                     new_stmt;
6429                 }
6430             }
6431
6432           next_stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6433           gap_count = 1;
6434           /* If NEXT_STMT accesses the same DR as the previous statement,
6435              put the same TMP_DATA_REF as its vectorized statement; otherwise
6436              get the next data-ref from RESULT_CHAIN.  */
6437           if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6438             break;
6439         }
6440     }
6441 }
6442
6443 /* Function vect_force_dr_alignment_p.
6444
6445    Returns whether the alignment of a DECL can be forced to be aligned
6446    on ALIGNMENT bit boundary.  */
6447
6448 bool
6449 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6450 {
6451   if (!VAR_P (decl))
6452     return false;
6453
6454   if (decl_in_symtab_p (decl)
6455       && !symtab_node::get (decl)->can_increase_alignment_p ())
6456     return false;
6457
6458   if (TREE_STATIC (decl))
6459     return (alignment <= MAX_OFILE_ALIGNMENT);
6460   else
6461     return (alignment <= MAX_STACK_ALIGNMENT);
6462 }
6463
6464
6465 /* Return whether the data reference DR is supported with respect to its
6466    alignment.
6467    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6468    it is aligned, i.e., check if it is possible to vectorize it with different
6469    alignment.  */
6470
6471 enum dr_alignment_support
6472 vect_supportable_dr_alignment (struct data_reference *dr,
6473                                bool check_aligned_accesses)
6474 {
6475   gimple *stmt = DR_STMT (dr);
6476   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6477   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6478   machine_mode mode = TYPE_MODE (vectype);
6479   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6480   struct loop *vect_loop = NULL;
6481   bool nested_in_vect_loop = false;
6482
6483   if (aligned_access_p (dr) && !check_aligned_accesses)
6484     return dr_aligned;
6485
6486   /* For now assume all conditional loads/stores support unaligned
6487      access without any special code.  */
6488   if (is_gimple_call (stmt)
6489       && gimple_call_internal_p (stmt)
6490       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6491           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6492     return dr_unaligned_supported;
6493
6494   if (loop_vinfo)
6495     {
6496       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6497       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6498     }
6499
6500   /* Possibly unaligned access.  */
6501
6502   /* We can choose between using the implicit realignment scheme (generating
6503      a misaligned_move stmt) and the explicit realignment scheme (generating
6504      aligned loads with a REALIGN_LOAD).  There are two variants to the
6505      explicit realignment scheme: optimized, and unoptimized.
6506      We can optimize the realignment only if the step between consecutive
6507      vector loads is equal to the vector size.  Since the vector memory
6508      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6509      is guaranteed that the misalignment amount remains the same throughout the
6510      execution of the vectorized loop.  Therefore, we can create the
6511      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6512      at the loop preheader.
6513
6514      However, in the case of outer-loop vectorization, when vectorizing a
6515      memory access in the inner-loop nested within the LOOP that is now being
6516      vectorized, while it is guaranteed that the misalignment of the
6517      vectorized memory access will remain the same in different outer-loop
6518      iterations, it is *not* guaranteed that is will remain the same throughout
6519      the execution of the inner-loop.  This is because the inner-loop advances
6520      with the original scalar step (and not in steps of VS).  If the inner-loop
6521      step happens to be a multiple of VS, then the misalignment remains fixed
6522      and we can use the optimized realignment scheme.  For example:
6523
6524       for (i=0; i<N; i++)
6525         for (j=0; j<M; j++)
6526           s += a[i+j];
6527
6528      When vectorizing the i-loop in the above example, the step between
6529      consecutive vector loads is 1, and so the misalignment does not remain
6530      fixed across the execution of the inner-loop, and the realignment cannot
6531      be optimized (as illustrated in the following pseudo vectorized loop):
6532
6533       for (i=0; i<N; i+=4)
6534         for (j=0; j<M; j++){
6535           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6536                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6537                          // (assuming that we start from an aligned address).
6538           }
6539
6540      We therefore have to use the unoptimized realignment scheme:
6541
6542       for (i=0; i<N; i+=4)
6543           for (j=k; j<M; j+=4)
6544           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6545                            // that the misalignment of the initial address is
6546                            // 0).
6547
6548      The loop can then be vectorized as follows:
6549
6550       for (k=0; k<4; k++){
6551         rt = get_realignment_token (&vp[k]);
6552         for (i=0; i<N; i+=4){
6553           v1 = vp[i+k];
6554           for (j=k; j<M; j+=4){
6555             v2 = vp[i+j+VS-1];
6556             va = REALIGN_LOAD <v1,v2,rt>;
6557             vs += va;
6558             v1 = v2;
6559           }
6560         }
6561     } */
6562
6563   if (DR_IS_READ (dr))
6564     {
6565       bool is_packed = false;
6566       tree type = (TREE_TYPE (DR_REF (dr)));
6567
6568       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6569           && (!targetm.vectorize.builtin_mask_for_load
6570               || targetm.vectorize.builtin_mask_for_load ()))
6571         {
6572           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6573
6574           /* If we are doing SLP then the accesses need not have the
6575              same alignment, instead it depends on the SLP group size.  */
6576           if (loop_vinfo
6577               && STMT_SLP_TYPE (stmt_info)
6578               && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6579                               * DR_GROUP_SIZE (vinfo_for_stmt
6580                                             (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6581                               TYPE_VECTOR_SUBPARTS (vectype)))
6582             ;
6583           else if (!loop_vinfo
6584                    || (nested_in_vect_loop
6585                        && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6586                                     GET_MODE_SIZE (TYPE_MODE (vectype)))))
6587             return dr_explicit_realign;
6588           else
6589             return dr_explicit_realign_optimized;
6590         }
6591       if (!known_alignment_for_access_p (dr))
6592         is_packed = not_size_aligned (DR_REF (dr));
6593
6594       if (targetm.vectorize.support_vector_misalignment
6595             (mode, type, DR_MISALIGNMENT (dr), is_packed))
6596         /* Can't software pipeline the loads, but can at least do them.  */
6597         return dr_unaligned_supported;
6598     }
6599   else
6600     {
6601       bool is_packed = false;
6602       tree type = (TREE_TYPE (DR_REF (dr)));
6603
6604       if (!known_alignment_for_access_p (dr))
6605         is_packed = not_size_aligned (DR_REF (dr));
6606
6607      if (targetm.vectorize.support_vector_misalignment
6608            (mode, type, DR_MISALIGNMENT (dr), is_packed))
6609        return dr_unaligned_supported;
6610     }
6611
6612   /* Unsupported.  */
6613   return dr_unaligned_unsupported;
6614 }