gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "params.h"
  53 #include "tree-cfg.h"
  54 #include "tree-hash-traits.h"
  55 #include "vec-perm-indices.h"
  56 #include "internal-fn.h"
  57
  58 /* Return true if load- or store-lanes optab OPTAB is implemented for
  59    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  60
  61 static bool
  62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  63                               tree vectype, unsigned HOST_WIDE_INT count)
  64 {
  65   machine_mode mode, array_mode;
  66   bool limit_p;
  67
  68   mode = TYPE_MODE (vectype);
  69   if (!targetm.array_mode (mode, count).exists (&array_mode))
  70     {
  71       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
  72       limit_p = !targetm.array_mode_supported_p (mode, count);
  73       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
  74         {
  75           if (dump_enabled_p ())
  76             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  77                              "no array mode for %s["
  78                              HOST_WIDE_INT_PRINT_DEC "]\n",
  79                              GET_MODE_NAME (mode), count);
  80           return false;
  81         }
  82     }
  83
  84   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  85     {
  86       if (dump_enabled_p ())
  87         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  88                          "cannot use %s<%s><%s>\n", name,
  89                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  90       return false;
  91     }
  92
  93   if (dump_enabled_p ())
  94     dump_printf_loc (MSG_NOTE, vect_location,
  95                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  96                      GET_MODE_NAME (mode));
  97
  98   return true;
  99 }
 100
 101
 102 /* Return the smallest scalar part of STMT.
 103    This is used to determine the vectype of the stmt.  We generally set the
 104    vectype according to the type of the result (lhs).  For stmts whose
 105    result-type is different than the type of the arguments (e.g., demotion,
 106    promotion), vectype will be reset appropriately (later).  Note that we have
 107    to visit the smallest datatype in this function, because that determines the
 108    VF.  If the smallest datatype in the loop is present only as the rhs of a
 109    promotion operation - we'd miss it.
 110    Such a case, where a variable of this datatype does not appear in the lhs
 111    anywhere in the loop, can only occur if it's an invariant: e.g.:
 112    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 113    invariant motion.  However, we cannot rely on invariant motion to always
 114    take invariants out of the loop, and so in the case of promotion we also
 115    have to check the rhs.
 116    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 117    types.  */
 118
 119 tree
 120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
 121                                HOST_WIDE_INT *rhs_size_unit)
 122 {
 123   tree scalar_type = gimple_expr_type (stmt);
 124   HOST_WIDE_INT lhs, rhs;
 125
 126   /* During the analysis phase, this function is called on arbitrary
 127      statements that might not have scalar results.  */
 128   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
 129     return scalar_type;
 130
 131   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 132
 133   if (is_gimple_assign (stmt)
 134       && (gimple_assign_cast_p (stmt)
 135           || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
 136           || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
 137           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
 138           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
 139           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
 140     {
 141       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 142
 143       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 144       if (rhs < lhs)
 145         scalar_type = rhs_type;
 146     }
 147
 148   *lhs_size_unit = lhs;
 149   *rhs_size_unit = rhs;
 150   return scalar_type;
 151 }
 152
 153
 154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 155    tested at run-time.  Return TRUE if DDR was successfully inserted.
 156    Return false if versioning is not supported.  */
 157
 158 static bool
 159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 160 {
 161   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 162
 163   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 164     return false;
 165
 166   if (!runtime_alias_check_p (ddr, loop,
 167                               optimize_loop_nest_for_speed_p (loop)))
 168     return false;
 169
 170   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 171   return true;
 172 }
 173
 174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
 175
 176 static void
 177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
 178 {
 179   vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
 180   for (unsigned int i = 0; i < checks.length(); ++i)
 181     if (checks[i] == value)
 182       return;
 183
 184   if (dump_enabled_p ())
 185     {
 186       dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
 187       dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
 188       dump_printf (MSG_NOTE, " is nonzero\n");
 189     }
 190   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
 191 }
 192
 193 /* Return true if we know that the order of vectorized STMT_A and
 194    vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
 195    At least one of the statements is a write.  */
 196
 197 static bool
 198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
 199 {
 200   stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
 201   stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
 202
 203   /* Single statements are always kept in their original order.  */
 204   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 205       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 206     return true;
 207
 208   /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
 209      group are emitted at the position of the first scalar load and all
 210      stores in a group are emitted at the position of the last scalar store.
 211      Thus writes will happen no earlier than their current position
 212      (but could happen later) while reads will happen no later than their
 213      current position (but could happen earlier).  Reordering is therefore
 214      only possible if the first access is a write.  */
 215   if (is_pattern_stmt_p (stmtinfo_a))
 216     stmt_a = STMT_VINFO_RELATED_STMT (stmtinfo_a);
 217   if (is_pattern_stmt_p (stmtinfo_b))
 218     stmt_b = STMT_VINFO_RELATED_STMT (stmtinfo_b);
 219   gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
 220   return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
 221 }
 222
 223 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 224    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 225    distances.  These distances are conservatively correct but they don't
 226    reflect a guaranteed dependence.
 227
 228    Return true if this function does all the work necessary to avoid
 229    an alias or false if the caller should use the dependence distances
 230    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 231    the depth of the loop described by LOOP_VINFO and the other arguments
 232    are as for vect_analyze_data_ref_dependence.  */
 233
 234 static bool
 235 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 236                                        loop_vec_info loop_vinfo,
 237                                        int loop_depth, unsigned int *max_vf)
 238 {
 239   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 240   lambda_vector dist_v;
 241   unsigned int i;
 242   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 243     {
 244       int dist = dist_v[loop_depth];
 245       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 246         {
 247           /* If the user asserted safelen >= DIST consecutive iterations
 248              can be executed concurrently, assume independence.
 249
 250              ??? An alternative would be to add the alias check even
 251              in this case, and vectorize the fallback loop with the
 252              maximum VF set to safelen.  However, if the user has
 253              explicitly given a length, it's less likely that that
 254              would be a win.  */
 255           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 256             {
 257               if ((unsigned int) loop->safelen < *max_vf)
 258                 *max_vf = loop->safelen;
 259               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 260               continue;
 261             }
 262
 263           /* For dependence distances of 2 or more, we have the option
 264              of limiting VF or checking for an alias at runtime.
 265              Prefer to check at runtime if we can, to avoid limiting
 266              the VF unnecessarily when the bases are in fact independent.
 267
 268              Note that the alias checks will be removed if the VF ends up
 269              being small enough.  */
 270           return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 271         }
 272     }
 273   return true;
 274 }
 275
 276
 277 /* Function vect_analyze_data_ref_dependence.
 278
 279    Return TRUE if there (might) exist a dependence between a memory-reference
 280    DRA and a memory-reference DRB.  When versioning for alias may check a
 281    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 282    the data dependence.  */
 283
 284 static bool
 285 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 286                                   loop_vec_info loop_vinfo,
 287                                   unsigned int *max_vf)
 288 {
 289   unsigned int i;
 290   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 291   struct data_reference *dra = DDR_A (ddr);
 292   struct data_reference *drb = DDR_B (ddr);
 293   stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
 294   stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
 295   lambda_vector dist_v;
 296   unsigned int loop_depth;
 297
 298   /* In loop analysis all data references should be vectorizable.  */
 299   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 300       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 301     gcc_unreachable ();
 302
 303   /* Independent data accesses.  */
 304   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 305     return false;
 306
 307   if (dra == drb
 308       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 309     return false;
 310
 311   /* We do not have to consider dependences between accesses that belong
 312      to the same group, unless the stride could be smaller than the
 313      group size.  */
 314   if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 315       && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 316           == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
 317       && !STMT_VINFO_STRIDED_P (stmtinfo_a))
 318     return false;
 319
 320   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 321      least two scalar iterations, there is always also a true dependence.
 322      As the vectorizer does not re-order loads and stores we can ignore
 323      the anti-dependence if TBAA can disambiguate both DRs similar to the
 324      case with known negative distance anti-dependences (positive
 325      distance anti-dependences would violate TBAA constraints).  */
 326   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 327        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 328       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 329                                  get_alias_set (DR_REF (drb))))
 330     return false;
 331
 332   /* Unknown data dependence.  */
 333   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 334     {
 335       /* If user asserted safelen consecutive iterations can be
 336          executed concurrently, assume independence.  */
 337       if (loop->safelen >= 2)
 338         {
 339           if ((unsigned int) loop->safelen < *max_vf)
 340             *max_vf = loop->safelen;
 341           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 342           return false;
 343         }
 344
 345       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 346           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 347         {
 348           if (dump_enabled_p ())
 349             {
 350               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 351                                "versioning for alias not supported for: "
 352                                "can't determine dependence between ");
 353               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 354                                  DR_REF (dra));
 355               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 356               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 357                                  DR_REF (drb));
 358               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 359             }
 360           return true;
 361         }
 362
 363       if (dump_enabled_p ())
 364         {
 365           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 366                            "versioning for alias required: "
 367                            "can't determine dependence between ");
 368           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 369                              DR_REF (dra));
 370           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 371           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 372                              DR_REF (drb));
 373           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 374         }
 375
 376       /* Add to list of ddrs that need to be tested at run-time.  */
 377       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 378     }
 379
 380   /* Known data dependence.  */
 381   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 382     {
 383       /* If user asserted safelen consecutive iterations can be
 384          executed concurrently, assume independence.  */
 385       if (loop->safelen >= 2)
 386         {
 387           if ((unsigned int) loop->safelen < *max_vf)
 388             *max_vf = loop->safelen;
 389           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 390           return false;
 391         }
 392
 393       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 394           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 395         {
 396           if (dump_enabled_p ())
 397             {
 398               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 399                                "versioning for alias not supported for: "
 400                                "bad dist vector for ");
 401               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 402                                  DR_REF (dra));
 403               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 404               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 405                                  DR_REF (drb));
 406               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 407             }
 408           return true;
 409         }
 410
 411       if (dump_enabled_p ())
 412         {
 413           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 414                            "versioning for alias required: "
 415                            "bad dist vector for ");
 416           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 417           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 418           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 419           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 420         }
 421       /* Add to list of ddrs that need to be tested at run-time.  */
 422       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 423     }
 424
 425   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 426
 427   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
 428       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
 429                                                 loop_depth, max_vf))
 430     return false;
 431
 432   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 433     {
 434       int dist = dist_v[loop_depth];
 435
 436       if (dump_enabled_p ())
 437         dump_printf_loc (MSG_NOTE, vect_location,
 438                          "dependence distance  = %d.\n", dist);
 439
 440       if (dist == 0)
 441         {
 442           if (dump_enabled_p ())
 443             {
 444               dump_printf_loc (MSG_NOTE, vect_location,
 445                                "dependence distance == 0 between ");
 446               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 447               dump_printf (MSG_NOTE, " and ");
 448               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 449               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 450             }
 451
 452           /* When we perform grouped accesses and perform implicit CSE
 453              by detecting equal accesses and doing disambiguation with
 454              runtime alias tests like for
 455                 .. = a[i];
 456                 .. = a[i+1];
 457                 a[i] = ..;
 458                 a[i+1] = ..;
 459                 *p = ..;
 460                 .. = a[i];
 461                 .. = a[i+1];
 462              where we will end up loading { a[i], a[i+1] } once, make
 463              sure that inserting group loads before the first load and
 464              stores after the last store will do the right thing.
 465              Similar for groups like
 466                 a[i] = ...;
 467                 ... = a[i];
 468                 a[i+1] = ...;
 469              where loads from the group interleave with the store.  */
 470           if (!vect_preserves_scalar_order_p (vect_dr_stmt(dra),
 471                                               vect_dr_stmt (drb)))
 472             {
 473               if (dump_enabled_p ())
 474                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 475                                  "READ_WRITE dependence in interleaving.\n");
 476               return true;
 477             }
 478
 479           if (loop->safelen < 2)
 480             {
 481               tree indicator = dr_zero_step_indicator (dra);
 482               if (TREE_CODE (indicator) != INTEGER_CST)
 483                 vect_check_nonzero_value (loop_vinfo, indicator);
 484               else if (integer_zerop (indicator))
 485                 {
 486                   if (dump_enabled_p ())
 487                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 488                                  "access also has a zero step\n");
 489                   return true;
 490                 }
 491             }
 492           continue;
 493         }
 494
 495       if (dist > 0 && DDR_REVERSED_P (ddr))
 496         {
 497           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 498              reversed (to make distance vector positive), and the actual
 499              distance is negative.  */
 500           if (dump_enabled_p ())
 501             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 502                              "dependence distance negative.\n");
 503           /* Record a negative dependence distance to later limit the
 504              amount of stmt copying / unrolling we can perform.
 505              Only need to handle read-after-write dependence.  */
 506           if (DR_IS_READ (drb)
 507               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 508                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 509             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 510           continue;
 511         }
 512
 513       unsigned int abs_dist = abs (dist);
 514       if (abs_dist >= 2 && abs_dist < *max_vf)
 515         {
 516           /* The dependence distance requires reduction of the maximal
 517              vectorization factor.  */
 518           *max_vf = abs (dist);
 519           if (dump_enabled_p ())
 520             dump_printf_loc (MSG_NOTE, vect_location,
 521                              "adjusting maximal vectorization factor to %i\n",
 522                              *max_vf);
 523         }
 524
 525       if (abs_dist >= *max_vf)
 526         {
 527           /* Dependence distance does not create dependence, as far as
 528              vectorization is concerned, in this case.  */
 529           if (dump_enabled_p ())
 530             dump_printf_loc (MSG_NOTE, vect_location,
 531                              "dependence distance >= VF.\n");
 532           continue;
 533         }
 534
 535       if (dump_enabled_p ())
 536         {
 537           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 538                        "not vectorized, possible dependence "
 539                        "between data-refs ");
 540           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 541           dump_printf (MSG_NOTE,  " and ");
 542           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 543           dump_printf (MSG_NOTE,  "\n");
 544         }
 545
 546       return true;
 547     }
 548
 549   return false;
 550 }
 551
 552 /* Function vect_analyze_data_ref_dependences.
 553
 554    Examine all the data references in the loop, and make sure there do not
 555    exist any data dependences between them.  Set *MAX_VF according to
 556    the maximum vectorization factor the data dependences allow.  */
 557
 558 bool
 559 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
 560                                    unsigned int *max_vf)
 561 {
 562   unsigned int i;
 563   struct data_dependence_relation *ddr;
 564
 565   if (dump_enabled_p ())
 566     dump_printf_loc (MSG_NOTE, vect_location,
 567                      "=== vect_analyze_data_ref_dependences ===\n");
 568
 569   LOOP_VINFO_DDRS (loop_vinfo)
 570     .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 571              * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 572   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 573   /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
 574   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 575                                 &LOOP_VINFO_DDRS (loop_vinfo),
 576                                 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
 577     return false;
 578
 579   /* For epilogues we either have no aliases or alias versioning
 580      was applied to original loop.  Therefore we may just get max_vf
 581      using VF of original loop.  */
 582   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 583     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 584   else
 585     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 586       if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
 587         return false;
 588
 589   return true;
 590 }
 591
 592
 593 /* Function vect_slp_analyze_data_ref_dependence.
 594
 595    Return TRUE if there (might) exist a dependence between a memory-reference
 596    DRA and a memory-reference DRB.  When versioning for alias may check a
 597    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 598    the data dependence.  */
 599
 600 static bool
 601 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
 602 {
 603   struct data_reference *dra = DDR_A (ddr);
 604   struct data_reference *drb = DDR_B (ddr);
 605
 606   /* We need to check dependences of statements marked as unvectorizable
 607      as well, they still can prohibit vectorization.  */
 608
 609   /* Independent data accesses.  */
 610   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 611     return false;
 612
 613   if (dra == drb)
 614     return false;
 615
 616   /* Read-read is OK.  */
 617   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 618     return false;
 619
 620   /* If dra and drb are part of the same interleaving chain consider
 621      them independent.  */
 622   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (vect_dr_stmt (dra)))
 623       && (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (dra)))
 624           == DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (drb)))))
 625     return false;
 626
 627   /* Unknown data dependence.  */
 628   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 629     {
 630       if  (dump_enabled_p ())
 631         {
 632           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 633                            "can't determine dependence between ");
 634           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 635           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 636           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 637           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 638         }
 639     }
 640   else if (dump_enabled_p ())
 641     {
 642       dump_printf_loc (MSG_NOTE, vect_location,
 643                        "determined dependence between ");
 644       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 645       dump_printf (MSG_NOTE, " and ");
 646       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 647       dump_printf (MSG_NOTE,  "\n");
 648     }
 649
 650   return true;
 651 }
 652
 653
 654 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 655    contain the vector of scalar stores of this instance if we are
 656    disambiguating the loads.  */
 657
 658 static bool
 659 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
 660                                    vec<gimple *> stores, gimple *last_store)
 661 {
 662   /* This walks over all stmts involved in the SLP load/store done
 663      in NODE verifying we can sink them up to the last stmt in the
 664      group.  */
 665   gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
 666   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 667     {
 668       gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
 669       if (access == last_access)
 670         continue;
 671       data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
 672       ao_ref ref;
 673       bool ref_initialized_p = false;
 674       for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
 675            gsi_stmt (gsi) != last_access; gsi_next (&gsi))
 676         {
 677           gimple *stmt = gsi_stmt (gsi);
 678           if (! gimple_vuse (stmt)
 679               || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
 680             continue;
 681
 682           /* If we couldn't record a (single) data reference for this
 683              stmt we have to resort to the alias oracle.  */
 684           data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
 685           if (!dr_b)
 686             {
 687               /* We are moving a store or sinking a load - this means
 688                  we cannot use TBAA for disambiguation.  */
 689               if (!ref_initialized_p)
 690                 ao_ref_init (&ref, DR_REF (dr_a));
 691               if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
 692                   || ref_maybe_used_by_stmt_p (stmt, &ref, false))
 693                 return false;
 694               continue;
 695             }
 696
 697           bool dependent = false;
 698           /* If we run into a store of this same instance (we've just
 699              marked those) then delay dependence checking until we run
 700              into the last store because this is where it will have
 701              been sunk to (and we verify if we can do that as well).  */
 702           if (gimple_visited_p (stmt))
 703             {
 704               if (stmt != last_store)
 705                 continue;
 706               unsigned i;
 707               gimple *store;
 708               FOR_EACH_VEC_ELT (stores, i, store)
 709                 {
 710                   data_reference *store_dr
 711                     = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
 712                   ddr_p ddr = initialize_data_dependence_relation
 713                                 (dr_a, store_dr, vNULL);
 714                   dependent = vect_slp_analyze_data_ref_dependence (ddr);
 715                   free_dependence_relation (ddr);
 716                   if (dependent)
 717                     break;
 718                 }
 719             }
 720           else
 721             {
 722               ddr_p ddr = initialize_data_dependence_relation (dr_a,
 723                                                                dr_b, vNULL);
 724               dependent = vect_slp_analyze_data_ref_dependence (ddr);
 725               free_dependence_relation (ddr);
 726             }
 727           if (dependent)
 728             return false;
 729         }
 730     }
 731   return true;
 732 }
 733
 734
 735 /* Function vect_analyze_data_ref_dependences.
 736
 737    Examine all the data references in the basic-block, and make sure there
 738    do not exist any data dependences between them.  Set *MAX_VF according to
 739    the maximum vectorization factor the data dependences allow.  */
 740
 741 bool
 742 vect_slp_analyze_instance_dependence (slp_instance instance)
 743 {
 744   if (dump_enabled_p ())
 745     dump_printf_loc (MSG_NOTE, vect_location,
 746                      "=== vect_slp_analyze_instance_dependence ===\n");
 747
 748   /* The stores of this instance are at the root of the SLP tree.  */
 749   slp_tree store = SLP_INSTANCE_TREE (instance);
 750   if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
 751     store = NULL;
 752
 753   /* Verify we can sink stores to the vectorized stmt insert location.  */
 754   gimple *last_store = NULL;
 755   if (store)
 756     {
 757       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
 758         return false;
 759
 760       /* Mark stores in this instance and remember the last one.  */
 761       last_store = vect_find_last_scalar_stmt_in_slp (store);
 762       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 763         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
 764     }
 765
 766   bool res = true;
 767
 768   /* Verify we can sink loads to the vectorized stmt insert location,
 769      special-casing stores of this instance.  */
 770   slp_tree load;
 771   unsigned int i;
 772   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
 773     if (! vect_slp_analyze_node_dependences (instance, load,
 774                                              store
 775                                              ? SLP_TREE_SCALAR_STMTS (store)
 776                                              : vNULL, last_store))
 777       {
 778         res = false;
 779         break;
 780       }
 781
 782   /* Unset the visited flag.  */
 783   if (store)
 784     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 785       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
 786
 787   return res;
 788 }
 789
 790 /* Record in VINFO the base alignment guarantee given by DRB.  STMT is
 791    the statement that contains DRB, which is useful for recording in the
 792    dump file.  */
 793
 794 static void
 795 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
 796                             innermost_loop_behavior *drb)
 797 {
 798   bool existed;
 799   innermost_loop_behavior *&entry
 800     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
 801   if (!existed || entry->base_alignment < drb->base_alignment)
 802     {
 803       entry = drb;
 804       if (dump_enabled_p ())
 805         {
 806           dump_printf_loc (MSG_NOTE, vect_location,
 807                            "recording new base alignment for ");
 808           dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
 809           dump_printf (MSG_NOTE, "\n");
 810           dump_printf_loc (MSG_NOTE, vect_location,
 811                            "  alignment:    %d\n", drb->base_alignment);
 812           dump_printf_loc (MSG_NOTE, vect_location,
 813                            "  misalignment: %d\n", drb->base_misalignment);
 814           dump_printf_loc (MSG_NOTE, vect_location,
 815                            "  based on:     ");
 816           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 817         }
 818     }
 819 }
 820
 821 /* If the region we're going to vectorize is reached, all unconditional
 822    data references occur at least once.  We can therefore pool the base
 823    alignment guarantees from each unconditional reference.  Do this by
 824    going through all the data references in VINFO and checking whether
 825    the containing statement makes the reference unconditionally.  If so,
 826    record the alignment of the base address in VINFO so that it can be
 827    used for all other references with the same base.  */
 828
 829 void
 830 vect_record_base_alignments (vec_info *vinfo)
 831 {
 832   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
 833   struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
 834   data_reference *dr;
 835   unsigned int i;
 836   FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
 837     {
 838       gimple *stmt = vect_dr_stmt (dr);
 839       if (!DR_IS_CONDITIONAL_IN_STMT (dr)
 840           && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
 841         {
 842           vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
 843
 844           /* If DR is nested in the loop that is being vectorized, we can also
 845              record the alignment of the base wrt the outer loop.  */
 846           if (loop && nested_in_vect_loop_p (loop, stmt))
 847             {
 848               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 849               vect_record_base_alignment
 850                  (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
 851             }
 852         }
 853     }
 854 }
 855
 856 /* Return the target alignment for the vectorized form of DR.  */
 857
 858 static unsigned int
 859 vect_calculate_target_alignment (struct data_reference *dr)
 860 {
 861   gimple *stmt = vect_dr_stmt (dr);
 862   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 863   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 864   return targetm.vectorize.preferred_vector_alignment (vectype);
 865 }
 866
 867 /* Function vect_compute_data_ref_alignment
 868
 869    Compute the misalignment of the data reference DR.
 870
 871    Output:
 872    1. If during the misalignment computation it is found that the data reference
 873       cannot be vectorized then false is returned.
 874    2. DR_MISALIGNMENT (DR) is defined.
 875
 876    FOR NOW: No analysis is actually performed. Misalignment is calculated
 877    only for trivial cases. TODO.  */
 878
 879 static bool
 880 vect_compute_data_ref_alignment (struct data_reference *dr)
 881 {
 882   gimple *stmt = vect_dr_stmt (dr);
 883   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 884   vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
 885   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 886   struct loop *loop = NULL;
 887   tree ref = DR_REF (dr);
 888   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 889
 890   if (dump_enabled_p ())
 891     dump_printf_loc (MSG_NOTE, vect_location,
 892                      "vect_compute_data_ref_alignment:\n");
 893
 894   if (loop_vinfo)
 895     loop = LOOP_VINFO_LOOP (loop_vinfo);
 896
 897   /* Initialize misalignment to unknown.  */
 898   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
 899
 900   innermost_loop_behavior *drb = vect_dr_behavior (dr);
 901   bool step_preserves_misalignment_p;
 902
 903   unsigned HOST_WIDE_INT vector_alignment
 904     = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
 905   DR_TARGET_ALIGNMENT (dr) = vector_alignment;
 906
 907   /* No step for BB vectorization.  */
 908   if (!loop)
 909     {
 910       gcc_assert (integer_zerop (drb->step));
 911       step_preserves_misalignment_p = true;
 912     }
 913
 914   /* In case the dataref is in an inner-loop of the loop that is being
 915      vectorized (LOOP), we use the base and misalignment information
 916      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 917      stays the same throughout the execution of the inner-loop, which is why
 918      we have to check that the stride of the dataref in the inner-loop evenly
 919      divides by the vector alignment.  */
 920   else if (nested_in_vect_loop_p (loop, stmt))
 921     {
 922       step_preserves_misalignment_p
 923         = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
 924
 925       if (dump_enabled_p ())
 926         {
 927           if (step_preserves_misalignment_p)
 928             dump_printf_loc (MSG_NOTE, vect_location,
 929                              "inner step divides the vector alignment.\n");
 930           else
 931             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 932                              "inner step doesn't divide the vector"
 933                              " alignment.\n");
 934         }
 935     }
 936
 937   /* Similarly we can only use base and misalignment information relative to
 938      an innermost loop if the misalignment stays the same throughout the
 939      execution of the loop.  As above, this is the case if the stride of
 940      the dataref evenly divides by the alignment.  */
 941   else
 942     {
 943       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 944       step_preserves_misalignment_p
 945         = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
 946
 947       if (!step_preserves_misalignment_p && dump_enabled_p ())
 948         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 949                          "step doesn't divide the vector alignment.\n");
 950     }
 951
 952   unsigned int base_alignment = drb->base_alignment;
 953   unsigned int base_misalignment = drb->base_misalignment;
 954
 955   /* Calculate the maximum of the pooled base address alignment and the
 956      alignment that we can compute for DR itself.  */
 957   innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
 958   if (entry && base_alignment < (*entry)->base_alignment)
 959     {
 960       base_alignment = (*entry)->base_alignment;
 961       base_misalignment = (*entry)->base_misalignment;
 962     }
 963
 964   if (drb->offset_alignment < vector_alignment
 965       || !step_preserves_misalignment_p
 966       /* We need to know whether the step wrt the vectorized loop is
 967          negative when computing the starting misalignment below.  */
 968       || TREE_CODE (drb->step) != INTEGER_CST)
 969     {
 970       if (dump_enabled_p ())
 971         {
 972           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 973                            "Unknown alignment for access: ");
 974           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 975           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 976         }
 977       return true;
 978     }
 979
 980   if (base_alignment < vector_alignment)
 981     {
 982       unsigned int max_alignment;
 983       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
 984       if (max_alignment < vector_alignment
 985           || !vect_can_force_dr_alignment_p (base,
 986                                              vector_alignment * BITS_PER_UNIT))
 987         {
 988           if (dump_enabled_p ())
 989             {
 990               dump_printf_loc (MSG_NOTE, vect_location,
 991                                "can't force alignment of ref: ");
 992               dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 993               dump_printf (MSG_NOTE, "\n");
 994             }
 995           return true;
 996         }
 997
 998       /* Force the alignment of the decl.
 999          NOTE: This is the only change to the code we make during
1000          the analysis phase, before deciding to vectorize the loop.  */
1001       if (dump_enabled_p ())
1002         {
1003           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
1004           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1005           dump_printf (MSG_NOTE, "\n");
1006         }
1007
1008       DR_VECT_AUX (dr)->base_decl = base;
1009       DR_VECT_AUX (dr)->base_misaligned = true;
1010       base_misalignment = 0;
1011     }
1012   poly_int64 misalignment
1013     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1014
1015   /* If this is a backward running DR then first access in the larger
1016      vectype actually is N-1 elements before the address in the DR.
1017      Adjust misalign accordingly.  */
1018   if (tree_int_cst_sgn (drb->step) < 0)
1019     /* PLUS because STEP is negative.  */
1020     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1021                      * TREE_INT_CST_LOW (drb->step));
1022
1023   unsigned int const_misalignment;
1024   if (!known_misalignment (misalignment, vector_alignment,
1025                            &const_misalignment))
1026     {
1027       if (dump_enabled_p ())
1028         {
1029           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1030                            "Non-constant misalignment for access: ");
1031           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1032           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1033         }
1034       return true;
1035     }
1036
1037   SET_DR_MISALIGNMENT (dr, const_misalignment);
1038
1039   if (dump_enabled_p ())
1040     {
1041       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1042                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1043       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1044       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1045     }
1046
1047   return true;
1048 }
1049
1050 /* Function vect_update_misalignment_for_peel.
1051    Sets DR's misalignment
1052    - to 0 if it has the same alignment as DR_PEEL,
1053    - to the misalignment computed using NPEEL if DR's salignment is known,
1054    - to -1 (unknown) otherwise.
1055
1056    DR - the data reference whose misalignment is to be adjusted.
1057    DR_PEEL - the data reference whose misalignment is being made
1058              zero in the vector loop by the peel.
1059    NPEEL - the number of iterations in the peel loop if the misalignment
1060            of DR_PEEL is known at compile time.  */
1061
1062 static void
1063 vect_update_misalignment_for_peel (struct data_reference *dr,
1064                                    struct data_reference *dr_peel, int npeel)
1065 {
1066   unsigned int i;
1067   vec<dr_p> same_aligned_drs;
1068   struct data_reference *current_dr;
1069   int dr_size = vect_get_scalar_dr_size (dr);
1070   int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1071   stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
1072   stmt_vec_info peel_stmt_info = vinfo_for_stmt (vect_dr_stmt (dr_peel));
1073
1074  /* For interleaved data accesses the step in the loop must be multiplied by
1075      the size of the interleaving group.  */
1076   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1077     dr_size *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
1078   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1079     dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1080
1081   /* It can be assumed that the data refs with the same alignment as dr_peel
1082      are aligned in the vector loop.  */
1083   same_aligned_drs
1084     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (vect_dr_stmt (dr_peel)));
1085   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1086     {
1087       if (current_dr != dr)
1088         continue;
1089       gcc_assert (!known_alignment_for_access_p (dr)
1090                   || !known_alignment_for_access_p (dr_peel)
1091                   || (DR_MISALIGNMENT (dr) / dr_size
1092                       == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1093       SET_DR_MISALIGNMENT (dr, 0);
1094       return;
1095     }
1096
1097   if (known_alignment_for_access_p (dr)
1098       && known_alignment_for_access_p (dr_peel))
1099     {
1100       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1101       int misal = DR_MISALIGNMENT (dr);
1102       misal += negative ? -npeel * dr_size : npeel * dr_size;
1103       misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1104       SET_DR_MISALIGNMENT (dr, misal);
1105       return;
1106     }
1107
1108   if (dump_enabled_p ())
1109     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1110                      "to unknown (-1).\n");
1111   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1112 }
1113
1114
1115 /* Function verify_data_ref_alignment
1116
1117    Return TRUE if DR can be handled with respect to alignment.  */
1118
1119 static bool
1120 verify_data_ref_alignment (data_reference_p dr)
1121 {
1122   enum dr_alignment_support supportable_dr_alignment
1123     = vect_supportable_dr_alignment (dr, false);
1124   if (!supportable_dr_alignment)
1125     {
1126       if (dump_enabled_p ())
1127         {
1128           if (DR_IS_READ (dr))
1129             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1130                              "not vectorized: unsupported unaligned load.");
1131           else
1132             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1133                              "not vectorized: unsupported unaligned "
1134                              "store.");
1135
1136           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1137                              DR_REF (dr));
1138           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1139         }
1140       return false;
1141     }
1142
1143   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1144     dump_printf_loc (MSG_NOTE, vect_location,
1145                      "Vectorizing an unaligned access.\n");
1146
1147   return true;
1148 }
1149
1150 /* Function vect_verify_datarefs_alignment
1151
1152    Return TRUE if all data references in the loop can be
1153    handled with respect to alignment.  */
1154
1155 bool
1156 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1157 {
1158   vec<data_reference_p> datarefs = vinfo->datarefs;
1159   struct data_reference *dr;
1160   unsigned int i;
1161
1162   FOR_EACH_VEC_ELT (datarefs, i, dr)
1163     {
1164       gimple *stmt = vect_dr_stmt (dr);
1165       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1166
1167       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1168         continue;
1169
1170       /* For interleaving, only the alignment of the first access matters.   */
1171       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1172           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1173         continue;
1174
1175       /* Strided accesses perform only component accesses, alignment is
1176          irrelevant for them.  */
1177       if (STMT_VINFO_STRIDED_P (stmt_info)
1178           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1179         continue;
1180
1181       if (! verify_data_ref_alignment (dr))
1182         return false;
1183     }
1184
1185   return true;
1186 }
1187
1188 /* Given an memory reference EXP return whether its alignment is less
1189    than its size.  */
1190
1191 static bool
1192 not_size_aligned (tree exp)
1193 {
1194   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1195     return true;
1196
1197   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1198           > get_object_alignment (exp));
1199 }
1200
1201 /* Function vector_alignment_reachable_p
1202
1203    Return true if vector alignment for DR is reachable by peeling
1204    a few loop iterations.  Return false otherwise.  */
1205
1206 static bool
1207 vector_alignment_reachable_p (struct data_reference *dr)
1208 {
1209   gimple *stmt = vect_dr_stmt (dr);
1210   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1211   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1212
1213   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1214     {
1215       /* For interleaved access we peel only if number of iterations in
1216          the prolog loop ({VF - misalignment}), is a multiple of the
1217          number of the interleaved accesses.  */
1218       int elem_size, mis_in_elements;
1219
1220       /* FORNOW: handle only known alignment.  */
1221       if (!known_alignment_for_access_p (dr))
1222         return false;
1223
1224       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1225       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1226       elem_size = vector_element_size (vector_size, nelements);
1227       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1228
1229       if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1230         return false;
1231     }
1232
1233   /* If misalignment is known at the compile time then allow peeling
1234      only if natural alignment is reachable through peeling.  */
1235   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1236     {
1237       HOST_WIDE_INT elmsize =
1238                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1239       if (dump_enabled_p ())
1240         {
1241           dump_printf_loc (MSG_NOTE, vect_location,
1242                            "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1243           dump_printf (MSG_NOTE,
1244                        ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1245         }
1246       if (DR_MISALIGNMENT (dr) % elmsize)
1247         {
1248           if (dump_enabled_p ())
1249             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1250                              "data size does not divide the misalignment.\n");
1251           return false;
1252         }
1253     }
1254
1255   if (!known_alignment_for_access_p (dr))
1256     {
1257       tree type = TREE_TYPE (DR_REF (dr));
1258       bool is_packed = not_size_aligned (DR_REF (dr));
1259       if (dump_enabled_p ())
1260         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1261                          "Unknown misalignment, %snaturally aligned\n",
1262                          is_packed ? "not " : "");
1263       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1264     }
1265
1266   return true;
1267 }
1268
1269
1270 /* Calculate the cost of the memory access represented by DR.  */
1271
1272 static void
1273 vect_get_data_access_cost (struct data_reference *dr,
1274                            unsigned int *inside_cost,
1275                            unsigned int *outside_cost,
1276                            stmt_vector_for_cost *body_cost_vec,
1277                            stmt_vector_for_cost *prologue_cost_vec)
1278 {
1279   gimple *stmt = vect_dr_stmt (dr);
1280   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1281   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1282   int ncopies;
1283
1284   if (PURE_SLP_STMT (stmt_info))
1285     ncopies = 1;
1286   else
1287     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1288
1289   if (DR_IS_READ (dr))
1290     vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1291                         prologue_cost_vec, body_cost_vec, false);
1292   else
1293     vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1294
1295   if (dump_enabled_p ())
1296     dump_printf_loc (MSG_NOTE, vect_location,
1297                      "vect_get_data_access_cost: inside_cost = %d, "
1298                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1299 }
1300
1301
1302 typedef struct _vect_peel_info
1303 {
1304   struct data_reference *dr;
1305   int npeel;
1306   unsigned int count;
1307 } *vect_peel_info;
1308
1309 typedef struct _vect_peel_extended_info
1310 {
1311   struct _vect_peel_info peel_info;
1312   unsigned int inside_cost;
1313   unsigned int outside_cost;
1314 } *vect_peel_extended_info;
1315
1316
1317 /* Peeling hashtable helpers.  */
1318
1319 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1320 {
1321   static inline hashval_t hash (const _vect_peel_info *);
1322   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1323 };
1324
1325 inline hashval_t
1326 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1327 {
1328   return (hashval_t) peel_info->npeel;
1329 }
1330
1331 inline bool
1332 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1333 {
1334   return (a->npeel == b->npeel);
1335 }
1336
1337
1338 /* Insert DR into peeling hash table with NPEEL as key.  */
1339
1340 static void
1341 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1342                           loop_vec_info loop_vinfo, struct data_reference *dr,
1343                           int npeel)
1344 {
1345   struct _vect_peel_info elem, *slot;
1346   _vect_peel_info **new_slot;
1347   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1348
1349   elem.npeel = npeel;
1350   slot = peeling_htab->find (&elem);
1351   if (slot)
1352     slot->count++;
1353   else
1354     {
1355       slot = XNEW (struct _vect_peel_info);
1356       slot->npeel = npeel;
1357       slot->dr = dr;
1358       slot->count = 1;
1359       new_slot = peeling_htab->find_slot (slot, INSERT);
1360       *new_slot = slot;
1361     }
1362
1363   if (!supportable_dr_alignment
1364       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1365     slot->count += VECT_MAX_COST;
1366 }
1367
1368
1369 /* Traverse peeling hash table to find peeling option that aligns maximum
1370    number of data accesses.  */
1371
1372 int
1373 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1374                                      _vect_peel_extended_info *max)
1375 {
1376   vect_peel_info elem = *slot;
1377
1378   if (elem->count > max->peel_info.count
1379       || (elem->count == max->peel_info.count
1380           && max->peel_info.npeel > elem->npeel))
1381     {
1382       max->peel_info.npeel = elem->npeel;
1383       max->peel_info.count = elem->count;
1384       max->peel_info.dr = elem->dr;
1385     }
1386
1387   return 1;
1388 }
1389
1390 /* Get the costs of peeling NPEEL iterations checking data access costs
1391    for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1392    misalignment will be zero after peeling.  */
1393
1394 static void
1395 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1396                                 struct data_reference *dr0,
1397                                 unsigned int *inside_cost,
1398                                 unsigned int *outside_cost,
1399                                 stmt_vector_for_cost *body_cost_vec,
1400                                 stmt_vector_for_cost *prologue_cost_vec,
1401                                 unsigned int npeel,
1402                                 bool unknown_misalignment)
1403 {
1404   unsigned i;
1405   data_reference *dr;
1406
1407   FOR_EACH_VEC_ELT (datarefs, i, dr)
1408     {
1409       gimple *stmt = vect_dr_stmt (dr);
1410       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1411       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1412         continue;
1413
1414       /* For interleaving, only the alignment of the first access
1415          matters.  */
1416       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1417           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1418         continue;
1419
1420       /* Strided accesses perform only component accesses, alignment is
1421          irrelevant for them.  */
1422       if (STMT_VINFO_STRIDED_P (stmt_info)
1423           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1424         continue;
1425
1426       int save_misalignment;
1427       save_misalignment = DR_MISALIGNMENT (dr);
1428       if (npeel == 0)
1429         ;
1430       else if (unknown_misalignment && dr == dr0)
1431         SET_DR_MISALIGNMENT (dr, 0);
1432       else
1433         vect_update_misalignment_for_peel (dr, dr0, npeel);
1434       vect_get_data_access_cost (dr, inside_cost, outside_cost,
1435                                  body_cost_vec, prologue_cost_vec);
1436       SET_DR_MISALIGNMENT (dr, save_misalignment);
1437     }
1438 }
1439
1440 /* Traverse peeling hash table and calculate cost for each peeling option.
1441    Find the one with the lowest cost.  */
1442
1443 int
1444 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1445                                    _vect_peel_extended_info *min)
1446 {
1447   vect_peel_info elem = *slot;
1448   int dummy;
1449   unsigned int inside_cost = 0, outside_cost = 0;
1450   gimple *stmt = vect_dr_stmt (elem->dr);
1451   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1452   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1453   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1454                        epilogue_cost_vec;
1455
1456   prologue_cost_vec.create (2);
1457   body_cost_vec.create (2);
1458   epilogue_cost_vec.create (2);
1459
1460   vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1461                                   elem->dr, &inside_cost, &outside_cost,
1462                                   &body_cost_vec, &prologue_cost_vec,
1463                                   elem->npeel, false);
1464
1465   body_cost_vec.release ();
1466
1467   outside_cost += vect_get_known_peeling_cost
1468     (loop_vinfo, elem->npeel, &dummy,
1469      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1470      &prologue_cost_vec, &epilogue_cost_vec);
1471
1472   /* Prologue and epilogue costs are added to the target model later.
1473      These costs depend only on the scalar iteration cost, the
1474      number of peeling iterations finally chosen, and the number of
1475      misaligned statements.  So discard the information found here.  */
1476   prologue_cost_vec.release ();
1477   epilogue_cost_vec.release ();
1478
1479   if (inside_cost < min->inside_cost
1480       || (inside_cost == min->inside_cost
1481           && outside_cost < min->outside_cost))
1482     {
1483       min->inside_cost = inside_cost;
1484       min->outside_cost = outside_cost;
1485       min->peel_info.dr = elem->dr;
1486       min->peel_info.npeel = elem->npeel;
1487       min->peel_info.count = elem->count;
1488     }
1489
1490   return 1;
1491 }
1492
1493
1494 /* Choose best peeling option by traversing peeling hash table and either
1495    choosing an option with the lowest cost (if cost model is enabled) or the
1496    option that aligns as many accesses as possible.  */
1497
1498 static struct _vect_peel_extended_info
1499 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1500                                        loop_vec_info loop_vinfo)
1501 {
1502    struct _vect_peel_extended_info res;
1503
1504    res.peel_info.dr = NULL;
1505
1506    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1507      {
1508        res.inside_cost = INT_MAX;
1509        res.outside_cost = INT_MAX;
1510        peeling_htab->traverse <_vect_peel_extended_info *,
1511                                vect_peeling_hash_get_lowest_cost> (&res);
1512      }
1513    else
1514      {
1515        res.peel_info.count = 0;
1516        peeling_htab->traverse <_vect_peel_extended_info *,
1517                                vect_peeling_hash_get_most_frequent> (&res);
1518        res.inside_cost = 0;
1519        res.outside_cost = 0;
1520      }
1521
1522    return res;
1523 }
1524
1525 /* Return true if the new peeling NPEEL is supported.  */
1526
1527 static bool
1528 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1529                           unsigned npeel)
1530 {
1531   unsigned i;
1532   struct data_reference *dr = NULL;
1533   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1534   gimple *stmt;
1535   stmt_vec_info stmt_info;
1536   enum dr_alignment_support supportable_dr_alignment;
1537
1538   /* Ensure that all data refs can be vectorized after the peel.  */
1539   FOR_EACH_VEC_ELT (datarefs, i, dr)
1540     {
1541       int save_misalignment;
1542
1543       if (dr == dr0)
1544         continue;
1545
1546       stmt = vect_dr_stmt (dr);
1547       stmt_info = vinfo_for_stmt (stmt);
1548       /* For interleaving, only the alignment of the first access
1549          matters.  */
1550       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1551           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1552         continue;
1553
1554       /* Strided accesses perform only component accesses, alignment is
1555          irrelevant for them.  */
1556       if (STMT_VINFO_STRIDED_P (stmt_info)
1557           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1558         continue;
1559
1560       save_misalignment = DR_MISALIGNMENT (dr);
1561       vect_update_misalignment_for_peel (dr, dr0, npeel);
1562       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1563       SET_DR_MISALIGNMENT (dr, save_misalignment);
1564
1565       if (!supportable_dr_alignment)
1566         return false;
1567     }
1568
1569   return true;
1570 }
1571
1572 /* Function vect_enhance_data_refs_alignment
1573
1574    This pass will use loop versioning and loop peeling in order to enhance
1575    the alignment of data references in the loop.
1576
1577    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1578    original loop is to be vectorized.  Any other loops that are created by
1579    the transformations performed in this pass - are not supposed to be
1580    vectorized.  This restriction will be relaxed.
1581
1582    This pass will require a cost model to guide it whether to apply peeling
1583    or versioning or a combination of the two.  For example, the scheme that
1584    intel uses when given a loop with several memory accesses, is as follows:
1585    choose one memory access ('p') which alignment you want to force by doing
1586    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1587    other accesses are not necessarily aligned, or (2) use loop versioning to
1588    generate one loop in which all accesses are aligned, and another loop in
1589    which only 'p' is necessarily aligned.
1590
1591    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1592    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1593    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1594
1595    Devising a cost model is the most critical aspect of this work.  It will
1596    guide us on which access to peel for, whether to use loop versioning, how
1597    many versions to create, etc.  The cost model will probably consist of
1598    generic considerations as well as target specific considerations (on
1599    powerpc for example, misaligned stores are more painful than misaligned
1600    loads).
1601
1602    Here are the general steps involved in alignment enhancements:
1603
1604      -- original loop, before alignment analysis:
1605         for (i=0; i<N; i++){
1606           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1607           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1608         }
1609
1610      -- After vect_compute_data_refs_alignment:
1611         for (i=0; i<N; i++){
1612           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1613           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1614         }
1615
1616      -- Possibility 1: we do loop versioning:
1617      if (p is aligned) {
1618         for (i=0; i<N; i++){    # loop 1A
1619           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1620           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1621         }
1622      }
1623      else {
1624         for (i=0; i<N; i++){    # loop 1B
1625           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1626           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1627         }
1628      }
1629
1630      -- Possibility 2: we do loop peeling:
1631      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1632         x = q[i];
1633         p[i] = y;
1634      }
1635      for (i = 3; i < N; i++){   # loop 2A
1636         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1637         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1638      }
1639
1640      -- Possibility 3: combination of loop peeling and versioning:
1641      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1642         x = q[i];
1643         p[i] = y;
1644      }
1645      if (p is aligned) {
1646         for (i = 3; i<N; i++){  # loop 3A
1647           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1648           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1649         }
1650      }
1651      else {
1652         for (i = 3; i<N; i++){  # loop 3B
1653           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1654           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1655         }
1656      }
1657
1658      These loops are later passed to loop_transform to be vectorized.  The
1659      vectorizer will use the alignment information to guide the transformation
1660      (whether to generate regular loads/stores, or with special handling for
1661      misalignment).  */
1662
1663 bool
1664 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1665 {
1666   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1667   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1668   enum dr_alignment_support supportable_dr_alignment;
1669   struct data_reference *dr0 = NULL, *first_store = NULL;
1670   struct data_reference *dr;
1671   unsigned int i, j;
1672   bool do_peeling = false;
1673   bool do_versioning = false;
1674   bool stat;
1675   gimple *stmt;
1676   stmt_vec_info stmt_info;
1677   unsigned int npeel = 0;
1678   bool one_misalignment_known = false;
1679   bool one_misalignment_unknown = false;
1680   bool one_dr_unsupportable = false;
1681   struct data_reference *unsupportable_dr = NULL;
1682   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1683   unsigned possible_npeel_number = 1;
1684   tree vectype;
1685   unsigned int mis, same_align_drs_max = 0;
1686   hash_table<peel_info_hasher> peeling_htab (1);
1687
1688   if (dump_enabled_p ())
1689     dump_printf_loc (MSG_NOTE, vect_location,
1690                      "=== vect_enhance_data_refs_alignment ===\n");
1691
1692   /* Reset data so we can safely be called multiple times.  */
1693   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1694   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1695
1696   /* While cost model enhancements are expected in the future, the high level
1697      view of the code at this time is as follows:
1698
1699      A) If there is a misaligned access then see if peeling to align
1700         this access can make all data references satisfy
1701         vect_supportable_dr_alignment.  If so, update data structures
1702         as needed and return true.
1703
1704      B) If peeling wasn't possible and there is a data reference with an
1705         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1706         then see if loop versioning checks can be used to make all data
1707         references satisfy vect_supportable_dr_alignment.  If so, update
1708         data structures as needed and return true.
1709
1710      C) If neither peeling nor versioning were successful then return false if
1711         any data reference does not satisfy vect_supportable_dr_alignment.
1712
1713      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1714
1715      Note, Possibility 3 above (which is peeling and versioning together) is not
1716      being done at this time.  */
1717
1718   /* (1) Peeling to force alignment.  */
1719
1720   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1721      Considerations:
1722      + How many accesses will become aligned due to the peeling
1723      - How many accesses will become unaligned due to the peeling,
1724        and the cost of misaligned accesses.
1725      - The cost of peeling (the extra runtime checks, the increase
1726        in code size).  */
1727
1728   FOR_EACH_VEC_ELT (datarefs, i, dr)
1729     {
1730       stmt = vect_dr_stmt (dr);
1731       stmt_info = vinfo_for_stmt (stmt);
1732
1733       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1734         continue;
1735
1736       /* For interleaving, only the alignment of the first access
1737          matters.  */
1738       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1739           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1740         continue;
1741
1742       /* For invariant accesses there is nothing to enhance.  */
1743       if (integer_zerop (DR_STEP (dr)))
1744         continue;
1745
1746       /* Strided accesses perform only component accesses, alignment is
1747          irrelevant for them.  */
1748       if (STMT_VINFO_STRIDED_P (stmt_info)
1749           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1750         continue;
1751
1752       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1753       do_peeling = vector_alignment_reachable_p (dr);
1754       if (do_peeling)
1755         {
1756           if (known_alignment_for_access_p (dr))
1757             {
1758               unsigned int npeel_tmp = 0;
1759               bool negative = tree_int_cst_compare (DR_STEP (dr),
1760                                                     size_zero_node) < 0;
1761
1762               vectype = STMT_VINFO_VECTYPE (stmt_info);
1763               unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1764               unsigned int dr_size = vect_get_scalar_dr_size (dr);
1765               mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1766               if (DR_MISALIGNMENT (dr) != 0)
1767                 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1768
1769               /* For multiple types, it is possible that the bigger type access
1770                  will have more than one peeling option.  E.g., a loop with two
1771                  types: one of size (vector size / 4), and the other one of
1772                  size (vector size / 8).  Vectorization factor will 8.  If both
1773                  accesses are misaligned by 3, the first one needs one scalar
1774                  iteration to be aligned, and the second one needs 5.  But the
1775                  first one will be aligned also by peeling 5 scalar
1776                  iterations, and in that case both accesses will be aligned.
1777                  Hence, except for the immediate peeling amount, we also want
1778                  to try to add full vector size, while we don't exceed
1779                  vectorization factor.
1780                  We do this automatically for cost model, since we calculate
1781                  cost for every peeling option.  */
1782               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1783                 {
1784                   poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1785                                           ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1786                   possible_npeel_number
1787                     = vect_get_num_vectors (nscalars, vectype);
1788
1789                   /* NPEEL_TMP is 0 when there is no misalignment, but also
1790                      allow peeling NELEMENTS.  */
1791                   if (DR_MISALIGNMENT (dr) == 0)
1792                     possible_npeel_number++;
1793                 }
1794
1795               /* Save info about DR in the hash table.  Also include peeling
1796                  amounts according to the explanation above.  */
1797               for (j = 0; j < possible_npeel_number; j++)
1798                 {
1799                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1800                                             dr, npeel_tmp);
1801                   npeel_tmp += target_align / dr_size;
1802                 }
1803
1804               one_misalignment_known = true;
1805             }
1806           else
1807             {
1808               /* If we don't know any misalignment values, we prefer
1809                  peeling for data-ref that has the maximum number of data-refs
1810                  with the same alignment, unless the target prefers to align
1811                  stores over load.  */
1812               unsigned same_align_drs
1813                 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1814               if (!dr0
1815                   || same_align_drs_max < same_align_drs)
1816                 {
1817                   same_align_drs_max = same_align_drs;
1818                   dr0 = dr;
1819                 }
1820               /* For data-refs with the same number of related
1821                  accesses prefer the one where the misalign
1822                  computation will be invariant in the outermost loop.  */
1823               else if (same_align_drs_max == same_align_drs)
1824                 {
1825                   struct loop *ivloop0, *ivloop;
1826                   ivloop0 = outermost_invariant_loop_for_expr
1827                     (loop, DR_BASE_ADDRESS (dr0));
1828                   ivloop = outermost_invariant_loop_for_expr
1829                     (loop, DR_BASE_ADDRESS (dr));
1830                   if ((ivloop && !ivloop0)
1831                       || (ivloop && ivloop0
1832                           && flow_loop_nested_p (ivloop, ivloop0)))
1833                     dr0 = dr;
1834                 }
1835
1836               one_misalignment_unknown = true;
1837
1838               /* Check for data refs with unsupportable alignment that
1839                  can be peeled.  */
1840               if (!supportable_dr_alignment)
1841               {
1842                 one_dr_unsupportable = true;
1843                 unsupportable_dr = dr;
1844               }
1845
1846               if (!first_store && DR_IS_WRITE (dr))
1847                 first_store = dr;
1848             }
1849         }
1850       else
1851         {
1852           if (!aligned_access_p (dr))
1853             {
1854               if (dump_enabled_p ())
1855                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1856                                  "vector alignment may not be reachable\n");
1857               break;
1858             }
1859         }
1860     }
1861
1862   /* Check if we can possibly peel the loop.  */
1863   if (!vect_can_advance_ivs_p (loop_vinfo)
1864       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1865       || loop->inner)
1866     do_peeling = false;
1867
1868   struct _vect_peel_extended_info peel_for_known_alignment;
1869   struct _vect_peel_extended_info peel_for_unknown_alignment;
1870   struct _vect_peel_extended_info best_peel;
1871
1872   peel_for_unknown_alignment.inside_cost = INT_MAX;
1873   peel_for_unknown_alignment.outside_cost = INT_MAX;
1874   peel_for_unknown_alignment.peel_info.count = 0;
1875
1876   if (do_peeling
1877       && one_misalignment_unknown)
1878     {
1879       /* Check if the target requires to prefer stores over loads, i.e., if
1880          misaligned stores are more expensive than misaligned loads (taking
1881          drs with same alignment into account).  */
1882       unsigned int load_inside_cost = 0;
1883       unsigned int load_outside_cost = 0;
1884       unsigned int store_inside_cost = 0;
1885       unsigned int store_outside_cost = 0;
1886       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1887
1888       stmt_vector_for_cost dummy;
1889       dummy.create (2);
1890       vect_get_peeling_costs_all_drs (datarefs, dr0,
1891                                       &load_inside_cost,
1892                                       &load_outside_cost,
1893                                       &dummy, &dummy, estimated_npeels, true);
1894       dummy.release ();
1895
1896       if (first_store)
1897         {
1898           dummy.create (2);
1899           vect_get_peeling_costs_all_drs (datarefs, first_store,
1900                                           &store_inside_cost,
1901                                           &store_outside_cost,
1902                                           &dummy, &dummy,
1903                                           estimated_npeels, true);
1904           dummy.release ();
1905         }
1906       else
1907         {
1908           store_inside_cost = INT_MAX;
1909           store_outside_cost = INT_MAX;
1910         }
1911
1912       if (load_inside_cost > store_inside_cost
1913           || (load_inside_cost == store_inside_cost
1914               && load_outside_cost > store_outside_cost))
1915         {
1916           dr0 = first_store;
1917           peel_for_unknown_alignment.inside_cost = store_inside_cost;
1918           peel_for_unknown_alignment.outside_cost = store_outside_cost;
1919         }
1920       else
1921         {
1922           peel_for_unknown_alignment.inside_cost = load_inside_cost;
1923           peel_for_unknown_alignment.outside_cost = load_outside_cost;
1924         }
1925
1926       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1927       prologue_cost_vec.create (2);
1928       epilogue_cost_vec.create (2);
1929
1930       int dummy2;
1931       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1932         (loop_vinfo, estimated_npeels, &dummy2,
1933          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1934          &prologue_cost_vec, &epilogue_cost_vec);
1935
1936       prologue_cost_vec.release ();
1937       epilogue_cost_vec.release ();
1938
1939       peel_for_unknown_alignment.peel_info.count = 1
1940         + STMT_VINFO_SAME_ALIGN_REFS
1941         (vinfo_for_stmt (vect_dr_stmt (dr0))).length ();
1942     }
1943
1944   peel_for_unknown_alignment.peel_info.npeel = 0;
1945   peel_for_unknown_alignment.peel_info.dr = dr0;
1946
1947   best_peel = peel_for_unknown_alignment;
1948
1949   peel_for_known_alignment.inside_cost = INT_MAX;
1950   peel_for_known_alignment.outside_cost = INT_MAX;
1951   peel_for_known_alignment.peel_info.count = 0;
1952   peel_for_known_alignment.peel_info.dr = NULL;
1953
1954   if (do_peeling && one_misalignment_known)
1955     {
1956       /* Peeling is possible, but there is no data access that is not supported
1957          unless aligned.  So we try to choose the best possible peeling from
1958          the hash table.  */
1959       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1960         (&peeling_htab, loop_vinfo);
1961     }
1962
1963   /* Compare costs of peeling for known and unknown alignment. */
1964   if (peel_for_known_alignment.peel_info.dr != NULL
1965       && peel_for_unknown_alignment.inside_cost
1966       >= peel_for_known_alignment.inside_cost)
1967     {
1968       best_peel = peel_for_known_alignment;
1969
1970       /* If the best peeling for known alignment has NPEEL == 0, perform no
1971          peeling at all except if there is an unsupportable dr that we can
1972          align.  */
1973       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1974         do_peeling = false;
1975     }
1976
1977   /* If there is an unsupportable data ref, prefer this over all choices so far
1978      since we'd have to discard a chosen peeling except when it accidentally
1979      aligned the unsupportable data ref.  */
1980   if (one_dr_unsupportable)
1981     dr0 = unsupportable_dr;
1982   else if (do_peeling)
1983     {
1984       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1985          TODO: Use nopeel_outside_cost or get rid of it?  */
1986       unsigned nopeel_inside_cost = 0;
1987       unsigned nopeel_outside_cost = 0;
1988
1989       stmt_vector_for_cost dummy;
1990       dummy.create (2);
1991       vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1992                                       &nopeel_outside_cost, &dummy, &dummy,
1993                                       0, false);
1994       dummy.release ();
1995
1996       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
1997          costs will be recorded.  */
1998       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1999       prologue_cost_vec.create (2);
2000       epilogue_cost_vec.create (2);
2001
2002       int dummy2;
2003       nopeel_outside_cost += vect_get_known_peeling_cost
2004         (loop_vinfo, 0, &dummy2,
2005          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2006          &prologue_cost_vec, &epilogue_cost_vec);
2007
2008       prologue_cost_vec.release ();
2009       epilogue_cost_vec.release ();
2010
2011       npeel = best_peel.peel_info.npeel;
2012       dr0 = best_peel.peel_info.dr;
2013
2014       /* If no peeling is not more expensive than the best peeling we
2015          have so far, don't perform any peeling.  */
2016       if (nopeel_inside_cost <= best_peel.inside_cost)
2017         do_peeling = false;
2018     }
2019
2020   if (do_peeling)
2021     {
2022       stmt = vect_dr_stmt (dr0);
2023       stmt_info = vinfo_for_stmt (stmt);
2024       vectype = STMT_VINFO_VECTYPE (stmt_info);
2025
2026       if (known_alignment_for_access_p (dr0))
2027         {
2028           bool negative = tree_int_cst_compare (DR_STEP (dr0),
2029                                                 size_zero_node) < 0;
2030           if (!npeel)
2031             {
2032               /* Since it's known at compile time, compute the number of
2033                  iterations in the peeled loop (the peeling factor) for use in
2034                  updating DR_MISALIGNMENT values.  The peeling factor is the
2035                  vectorization factor minus the misalignment as an element
2036                  count.  */
2037               mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2038               unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2039               npeel = ((mis & (target_align - 1))
2040                        / vect_get_scalar_dr_size (dr0));
2041             }
2042
2043           /* For interleaved data access every iteration accesses all the
2044              members of the group, therefore we divide the number of iterations
2045              by the group size.  */
2046           stmt_info = vinfo_for_stmt (vect_dr_stmt (dr0));
2047           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2048             npeel /= DR_GROUP_SIZE (stmt_info);
2049
2050           if (dump_enabled_p ())
2051             dump_printf_loc (MSG_NOTE, vect_location,
2052                              "Try peeling by %d\n", npeel);
2053         }
2054
2055       /* Ensure that all datarefs can be vectorized after the peel.  */
2056       if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2057         do_peeling = false;
2058
2059       /* Check if all datarefs are supportable and log.  */
2060       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2061         {
2062           stat = vect_verify_datarefs_alignment (loop_vinfo);
2063           if (!stat)
2064             do_peeling = false;
2065           else
2066             return stat;
2067         }
2068
2069       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2070       if (do_peeling)
2071         {
2072           unsigned max_allowed_peel
2073             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2074           if (max_allowed_peel != (unsigned)-1)
2075             {
2076               unsigned max_peel = npeel;
2077               if (max_peel == 0)
2078                 {
2079                   unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2080                   max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2081                 }
2082               if (max_peel > max_allowed_peel)
2083                 {
2084                   do_peeling = false;
2085                   if (dump_enabled_p ())
2086                     dump_printf_loc (MSG_NOTE, vect_location,
2087                         "Disable peeling, max peels reached: %d\n", max_peel);
2088                 }
2089             }
2090         }
2091
2092       /* Cost model #2 - if peeling may result in a remaining loop not
2093          iterating enough to be vectorized then do not peel.  Since this
2094          is a cost heuristic rather than a correctness decision, use the
2095          most likely runtime value for variable vectorization factors.  */
2096       if (do_peeling
2097           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2098         {
2099           unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2100           unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2101           if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2102               < assumed_vf + max_peel)
2103             do_peeling = false;
2104         }
2105
2106       if (do_peeling)
2107         {
2108           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2109              If the misalignment of DR_i is identical to that of dr0 then set
2110              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2111              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2112              by the peeling factor times the element size of DR_i (MOD the
2113              vectorization factor times the size).  Otherwise, the
2114              misalignment of DR_i must be set to unknown.  */
2115           FOR_EACH_VEC_ELT (datarefs, i, dr)
2116             if (dr != dr0)
2117               {
2118                 /* Strided accesses perform only component accesses, alignment
2119                    is irrelevant for them.  */
2120                 stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
2121                 if (STMT_VINFO_STRIDED_P (stmt_info)
2122                     && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2123                   continue;
2124
2125                 vect_update_misalignment_for_peel (dr, dr0, npeel);
2126               }
2127
2128           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2129           if (npeel)
2130             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2131           else
2132             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2133               = DR_MISALIGNMENT (dr0);
2134           SET_DR_MISALIGNMENT (dr0, 0);
2135           if (dump_enabled_p ())
2136             {
2137               dump_printf_loc (MSG_NOTE, vect_location,
2138                                "Alignment of access forced using peeling.\n");
2139               dump_printf_loc (MSG_NOTE, vect_location,
2140                                "Peeling for alignment will be applied.\n");
2141             }
2142
2143           /* The inside-loop cost will be accounted for in vectorizable_load
2144              and vectorizable_store correctly with adjusted alignments.
2145              Drop the body_cst_vec on the floor here.  */
2146           stat = vect_verify_datarefs_alignment (loop_vinfo);
2147           gcc_assert (stat);
2148           return stat;
2149         }
2150     }
2151
2152   /* (2) Versioning to force alignment.  */
2153
2154   /* Try versioning if:
2155      1) optimize loop for speed
2156      2) there is at least one unsupported misaligned data ref with an unknown
2157         misalignment, and
2158      3) all misaligned data refs with a known misalignment are supported, and
2159      4) the number of runtime alignment checks is within reason.  */
2160
2161   do_versioning =
2162         optimize_loop_nest_for_speed_p (loop)
2163         && (!loop->inner); /* FORNOW */
2164
2165   if (do_versioning)
2166     {
2167       FOR_EACH_VEC_ELT (datarefs, i, dr)
2168         {
2169           stmt = vect_dr_stmt (dr);
2170           stmt_info = vinfo_for_stmt (stmt);
2171
2172           /* For interleaving, only the alignment of the first access
2173              matters.  */
2174           if (aligned_access_p (dr)
2175               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2176                   && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2177             continue;
2178
2179           if (STMT_VINFO_STRIDED_P (stmt_info))
2180             {
2181               /* Strided loads perform only component accesses, alignment is
2182                  irrelevant for them.  */
2183               if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2184                 continue;
2185               do_versioning = false;
2186               break;
2187             }
2188
2189           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2190
2191           if (!supportable_dr_alignment)
2192             {
2193               gimple *stmt;
2194               int mask;
2195               tree vectype;
2196
2197               if (known_alignment_for_access_p (dr)
2198                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2199                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2200                 {
2201                   do_versioning = false;
2202                   break;
2203                 }
2204
2205               stmt = vect_dr_stmt (dr);
2206               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2207               gcc_assert (vectype);
2208
2209               /* At present we don't support versioning for alignment
2210                  with variable VF, since there's no guarantee that the
2211                  VF is a power of two.  We could relax this if we added
2212                  a way of enforcing a power-of-two size.  */
2213               unsigned HOST_WIDE_INT size;
2214               if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2215                 {
2216                   do_versioning = false;
2217                   break;
2218                 }
2219
2220               /* The rightmost bits of an aligned address must be zeros.
2221                  Construct the mask needed for this test.  For example,
2222                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2223                  mask must be 15 = 0xf. */
2224               mask = size - 1;
2225
2226               /* FORNOW: use the same mask to test all potentially unaligned
2227                  references in the loop.  The vectorizer currently supports
2228                  a single vector size, see the reference to
2229                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2230                  vectorization factor is computed.  */
2231               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2232                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2233               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2234               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2235                       vect_dr_stmt (dr));
2236             }
2237         }
2238
2239       /* Versioning requires at least one misaligned data reference.  */
2240       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2241         do_versioning = false;
2242       else if (!do_versioning)
2243         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2244     }
2245
2246   if (do_versioning)
2247     {
2248       vec<gimple *> may_misalign_stmts
2249         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2250       gimple *stmt;
2251
2252       /* It can now be assumed that the data references in the statements
2253          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2254          of the loop being vectorized.  */
2255       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2256         {
2257           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2258           dr = STMT_VINFO_DATA_REF (stmt_info);
2259           SET_DR_MISALIGNMENT (dr, 0);
2260           if (dump_enabled_p ())
2261             dump_printf_loc (MSG_NOTE, vect_location,
2262                              "Alignment of access forced using versioning.\n");
2263         }
2264
2265       if (dump_enabled_p ())
2266         dump_printf_loc (MSG_NOTE, vect_location,
2267                          "Versioning for alignment will be applied.\n");
2268
2269       /* Peeling and versioning can't be done together at this time.  */
2270       gcc_assert (! (do_peeling && do_versioning));
2271
2272       stat = vect_verify_datarefs_alignment (loop_vinfo);
2273       gcc_assert (stat);
2274       return stat;
2275     }
2276
2277   /* This point is reached if neither peeling nor versioning is being done.  */
2278   gcc_assert (! (do_peeling || do_versioning));
2279
2280   stat = vect_verify_datarefs_alignment (loop_vinfo);
2281   return stat;
2282 }
2283
2284
2285 /* Function vect_find_same_alignment_drs.
2286
2287    Update group and alignment relations according to the chosen
2288    vectorization factor.  */
2289
2290 static void
2291 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2292 {
2293   struct data_reference *dra = DDR_A (ddr);
2294   struct data_reference *drb = DDR_B (ddr);
2295   stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
2296   stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
2297
2298   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2299     return;
2300
2301   if (dra == drb)
2302     return;
2303
2304   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2305       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2306       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2307     return;
2308
2309   /* Two references with distance zero have the same alignment.  */
2310   poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2311                           - wi::to_poly_offset (DR_INIT (drb)));
2312   if (maybe_ne (diff, 0))
2313     {
2314       /* Get the wider of the two alignments.  */
2315       unsigned int align_a = (vect_calculate_target_alignment (dra)
2316                               / BITS_PER_UNIT);
2317       unsigned int align_b = (vect_calculate_target_alignment (drb)
2318                               / BITS_PER_UNIT);
2319       unsigned int max_align = MAX (align_a, align_b);
2320
2321       /* Require the gap to be a multiple of the larger vector alignment.  */
2322       if (!multiple_p (diff, max_align))
2323         return;
2324     }
2325
2326   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2327   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2328   if (dump_enabled_p ())
2329     {
2330       dump_printf_loc (MSG_NOTE, vect_location,
2331                        "accesses have the same alignment: ");
2332       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2333       dump_printf (MSG_NOTE,  " and ");
2334       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2335       dump_printf (MSG_NOTE, "\n");
2336     }
2337 }
2338
2339
2340 /* Function vect_analyze_data_refs_alignment
2341
2342    Analyze the alignment of the data-references in the loop.
2343    Return FALSE if a data reference is found that cannot be vectorized.  */
2344
2345 bool
2346 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2347 {
2348   if (dump_enabled_p ())
2349     dump_printf_loc (MSG_NOTE, vect_location,
2350                      "=== vect_analyze_data_refs_alignment ===\n");
2351
2352   /* Mark groups of data references with same alignment using
2353      data dependence information.  */
2354   vec<ddr_p> ddrs = vinfo->ddrs;
2355   struct data_dependence_relation *ddr;
2356   unsigned int i;
2357
2358   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2359     vect_find_same_alignment_drs (ddr);
2360
2361   vec<data_reference_p> datarefs = vinfo->datarefs;
2362   struct data_reference *dr;
2363
2364   vect_record_base_alignments (vinfo);
2365   FOR_EACH_VEC_ELT (datarefs, i, dr)
2366     {
2367       stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
2368       if (STMT_VINFO_VECTORIZABLE (stmt_info)
2369           && !vect_compute_data_ref_alignment (dr))
2370         {
2371           /* Strided accesses perform only component accesses, misalignment
2372              information is irrelevant for them.  */
2373           if (STMT_VINFO_STRIDED_P (stmt_info)
2374               && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2375             continue;
2376
2377           if (dump_enabled_p ())
2378             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2379                              "not vectorized: can't calculate alignment "
2380                              "for data ref.\n");
2381
2382           return false;
2383         }
2384     }
2385
2386   return true;
2387 }
2388
2389
2390 /* Analyze alignment of DRs of stmts in NODE.  */
2391
2392 static bool
2393 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2394 {
2395   /* We vectorize from the first scalar stmt in the node unless
2396      the node is permuted in which case we start from the first
2397      element in the group.  */
2398   gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2399   data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2400   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2401     first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2402
2403   data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2404   if (! vect_compute_data_ref_alignment (dr)
2405       /* For creating the data-ref pointer we need alignment of the
2406          first element anyway.  */
2407       || (dr != first_dr
2408           && ! vect_compute_data_ref_alignment (first_dr))
2409       || ! verify_data_ref_alignment (dr))
2410     {
2411       if (dump_enabled_p ())
2412         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2413                          "not vectorized: bad data alignment in basic "
2414                          "block.\n");
2415       return false;
2416     }
2417
2418   return true;
2419 }
2420
2421 /* Function vect_slp_analyze_instance_alignment
2422
2423    Analyze the alignment of the data-references in the SLP instance.
2424    Return FALSE if a data reference is found that cannot be vectorized.  */
2425
2426 bool
2427 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2428 {
2429   if (dump_enabled_p ())
2430     dump_printf_loc (MSG_NOTE, vect_location,
2431                      "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2432
2433   slp_tree node;
2434   unsigned i;
2435   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2436     if (! vect_slp_analyze_and_verify_node_alignment (node))
2437       return false;
2438
2439   node = SLP_INSTANCE_TREE (instance);
2440   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2441       && ! vect_slp_analyze_and_verify_node_alignment
2442              (SLP_INSTANCE_TREE (instance)))
2443     return false;
2444
2445   return true;
2446 }
2447
2448
2449 /* Analyze groups of accesses: check that DR belongs to a group of
2450    accesses of legal size, step, etc.  Detect gaps, single element
2451    interleaving, and other special cases. Set grouped access info.
2452    Collect groups of strided stores for further use in SLP analysis.
2453    Worker for vect_analyze_group_access.  */
2454
2455 static bool
2456 vect_analyze_group_access_1 (struct data_reference *dr)
2457 {
2458   tree step = DR_STEP (dr);
2459   tree scalar_type = TREE_TYPE (DR_REF (dr));
2460   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2461   gimple *stmt = vect_dr_stmt (dr);
2462   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2463   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2464   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2465   HOST_WIDE_INT dr_step = -1;
2466   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2467   bool slp_impossible = false;
2468
2469   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2470      size of the interleaving group (including gaps).  */
2471   if (tree_fits_shwi_p (step))
2472     {
2473       dr_step = tree_to_shwi (step);
2474       /* Check that STEP is a multiple of type size.  Otherwise there is
2475          a non-element-sized gap at the end of the group which we
2476          cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2477          ???  As we can handle non-constant step fine here we should
2478          simply remove uses of DR_GROUP_GAP between the last and first
2479          element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
2480          simply not include that gap.  */
2481       if ((dr_step % type_size) != 0)
2482         {
2483           if (dump_enabled_p ())
2484             {
2485               dump_printf_loc (MSG_NOTE, vect_location,
2486                                "Step ");
2487               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2488               dump_printf (MSG_NOTE,
2489                            " is not a multiple of the element size for ");
2490               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2491               dump_printf (MSG_NOTE, "\n");
2492             }
2493           return false;
2494         }
2495       groupsize = absu_hwi (dr_step) / type_size;
2496     }
2497   else
2498     groupsize = 0;
2499
2500   /* Not consecutive access is possible only if it is a part of interleaving.  */
2501   if (!DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2502     {
2503       /* Check if it this DR is a part of interleaving, and is a single
2504          element of the group that is accessed in the loop.  */
2505
2506       /* Gaps are supported only for loads. STEP must be a multiple of the type
2507          size.  */
2508       if (DR_IS_READ (dr)
2509           && (dr_step % type_size) == 0
2510           && groupsize > 0)
2511         {
2512           DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2513           DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2514           DR_GROUP_GAP (stmt_info) = groupsize - 1;
2515           if (dump_enabled_p ())
2516             {
2517               dump_printf_loc (MSG_NOTE, vect_location,
2518                                "Detected single element interleaving ");
2519               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2520               dump_printf (MSG_NOTE, " step ");
2521               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2522               dump_printf (MSG_NOTE, "\n");
2523             }
2524
2525           return true;
2526         }
2527
2528       if (dump_enabled_p ())
2529         {
2530           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2531                            "not consecutive access ");
2532           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2533         }
2534
2535       if (bb_vinfo)
2536         {
2537           /* Mark the statement as unvectorizable.  */
2538           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr))) = false;
2539           return true;
2540         }
2541
2542       dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2543       STMT_VINFO_STRIDED_P (stmt_info) = true;
2544       return true;
2545     }
2546
2547   if (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2548     {
2549       /* First stmt in the interleaving chain. Check the chain.  */
2550       gimple *next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2551       struct data_reference *data_ref = dr;
2552       unsigned int count = 1;
2553       tree prev_init = DR_INIT (data_ref);
2554       gimple *prev = stmt;
2555       HOST_WIDE_INT diff, gaps = 0;
2556
2557       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2558       while (next)
2559         {
2560           /* Skip same data-refs.  In case that two or more stmts share
2561              data-ref (supported only for loads), we vectorize only the first
2562              stmt, and the rest get their vectorized loads from the first
2563              one.  */
2564           if (!tree_int_cst_compare (DR_INIT (data_ref),
2565                                      DR_INIT (STMT_VINFO_DATA_REF (
2566                                                    vinfo_for_stmt (next)))))
2567             {
2568               if (DR_IS_WRITE (data_ref))
2569                 {
2570                   if (dump_enabled_p ())
2571                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2572                                      "Two store stmts share the same dr.\n");
2573                   return false;
2574                 }
2575
2576               if (dump_enabled_p ())
2577                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2578                                  "Two or more load stmts share the same dr.\n");
2579
2580               /* For load use the same data-ref load.  */
2581               DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2582
2583               prev = next;
2584               next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2585               continue;
2586             }
2587
2588           prev = next;
2589           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2590
2591           /* All group members have the same STEP by construction.  */
2592           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2593
2594           /* Check that the distance between two accesses is equal to the type
2595              size. Otherwise, we have gaps.  */
2596           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2597                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2598           if (diff != 1)
2599             {
2600               /* FORNOW: SLP of accesses with gaps is not supported.  */
2601               slp_impossible = true;
2602               if (DR_IS_WRITE (data_ref))
2603                 {
2604                   if (dump_enabled_p ())
2605                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2606                                      "interleaved store with gaps\n");
2607                   return false;
2608                 }
2609
2610               gaps += diff - 1;
2611             }
2612
2613           last_accessed_element += diff;
2614
2615           /* Store the gap from the previous member of the group. If there is no
2616              gap in the access, DR_GROUP_GAP is always 1.  */
2617           DR_GROUP_GAP (vinfo_for_stmt (next)) = diff;
2618
2619           prev_init = DR_INIT (data_ref);
2620           next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2621           /* Count the number of data-refs in the chain.  */
2622           count++;
2623         }
2624
2625       if (groupsize == 0)
2626         groupsize = count + gaps;
2627
2628       /* This could be UINT_MAX but as we are generating code in a very
2629          inefficient way we have to cap earlier.  See PR78699 for example.  */
2630       if (groupsize > 4096)
2631         {
2632           if (dump_enabled_p ())
2633             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2634                              "group is too large\n");
2635           return false;
2636         }
2637
2638       /* Check that the size of the interleaving is equal to count for stores,
2639          i.e., that there are no gaps.  */
2640       if (groupsize != count
2641           && !DR_IS_READ (dr))
2642         {
2643           if (dump_enabled_p ())
2644             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2645                              "interleaved store with gaps\n");
2646           return false;
2647         }
2648
2649       /* If there is a gap after the last load in the group it is the
2650          difference between the groupsize and the last accessed
2651          element.
2652          When there is no gap, this difference should be 0.  */
2653       DR_GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2654
2655       DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2656       if (dump_enabled_p ())
2657         {
2658           dump_printf_loc (MSG_NOTE, vect_location,
2659                            "Detected interleaving ");
2660           if (DR_IS_READ (dr))
2661             dump_printf (MSG_NOTE, "load ");
2662           else
2663             dump_printf (MSG_NOTE, "store ");
2664           dump_printf (MSG_NOTE, "of size %u starting with ",
2665                        (unsigned)groupsize);
2666           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2667           if (DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2668             dump_printf_loc (MSG_NOTE, vect_location,
2669                              "There is a gap of %u elements after the group\n",
2670                              DR_GROUP_GAP (vinfo_for_stmt (stmt)));
2671         }
2672
2673       /* SLP: create an SLP data structure for every interleaving group of
2674          stores for further analysis in vect_analyse_slp.  */
2675       if (DR_IS_WRITE (dr) && !slp_impossible)
2676         {
2677           if (loop_vinfo)
2678             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2679           if (bb_vinfo)
2680             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2681         }
2682     }
2683
2684   return true;
2685 }
2686
2687 /* Analyze groups of accesses: check that DR belongs to a group of
2688    accesses of legal size, step, etc.  Detect gaps, single element
2689    interleaving, and other special cases. Set grouped access info.
2690    Collect groups of strided stores for further use in SLP analysis.  */
2691
2692 static bool
2693 vect_analyze_group_access (struct data_reference *dr)
2694 {
2695   if (!vect_analyze_group_access_1 (dr))
2696     {
2697       /* Dissolve the group if present.  */
2698       gimple *next;
2699       gimple *stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (dr)));
2700       while (stmt)
2701         {
2702           stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2703           next = DR_GROUP_NEXT_ELEMENT (vinfo);
2704           DR_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2705           DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2706           stmt = next;
2707         }
2708       return false;
2709     }
2710   return true;
2711 }
2712
2713 /* Analyze the access pattern of the data-reference DR.
2714    In case of non-consecutive accesses call vect_analyze_group_access() to
2715    analyze groups of accesses.  */
2716
2717 static bool
2718 vect_analyze_data_ref_access (struct data_reference *dr)
2719 {
2720   tree step = DR_STEP (dr);
2721   tree scalar_type = TREE_TYPE (DR_REF (dr));
2722   gimple *stmt = vect_dr_stmt (dr);
2723   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2724   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2725   struct loop *loop = NULL;
2726
2727   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2728     return true;
2729
2730   if (loop_vinfo)
2731     loop = LOOP_VINFO_LOOP (loop_vinfo);
2732
2733   if (loop_vinfo && !step)
2734     {
2735       if (dump_enabled_p ())
2736         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2737                          "bad data-ref access in loop\n");
2738       return false;
2739     }
2740
2741   /* Allow loads with zero step in inner-loop vectorization.  */
2742   if (loop_vinfo && integer_zerop (step))
2743     {
2744       DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2745       if (!nested_in_vect_loop_p (loop, stmt))
2746         return DR_IS_READ (dr);
2747       /* Allow references with zero step for outer loops marked
2748          with pragma omp simd only - it guarantees absence of
2749          loop-carried dependencies between inner loop iterations.  */
2750       if (loop->safelen < 2)
2751         {
2752           if (dump_enabled_p ())
2753             dump_printf_loc (MSG_NOTE, vect_location,
2754                              "zero step in inner loop of nest\n");
2755           return false;
2756         }
2757     }
2758
2759   if (loop && nested_in_vect_loop_p (loop, stmt))
2760     {
2761       /* Interleaved accesses are not yet supported within outer-loop
2762         vectorization for references in the inner-loop.  */
2763       DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2764
2765       /* For the rest of the analysis we use the outer-loop step.  */
2766       step = STMT_VINFO_DR_STEP (stmt_info);
2767       if (integer_zerop (step))
2768         {
2769           if (dump_enabled_p ())
2770             dump_printf_loc (MSG_NOTE, vect_location,
2771                              "zero step in outer loop.\n");
2772           return DR_IS_READ (dr);
2773         }
2774     }
2775
2776   /* Consecutive?  */
2777   if (TREE_CODE (step) == INTEGER_CST)
2778     {
2779       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2780       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2781           || (dr_step < 0
2782               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2783         {
2784           /* Mark that it is not interleaving.  */
2785           DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2786           return true;
2787         }
2788     }
2789
2790   if (loop && nested_in_vect_loop_p (loop, stmt))
2791     {
2792       if (dump_enabled_p ())
2793         dump_printf_loc (MSG_NOTE, vect_location,
2794                          "grouped access in outer loop.\n");
2795       return false;
2796     }
2797
2798
2799   /* Assume this is a DR handled by non-constant strided load case.  */
2800   if (TREE_CODE (step) != INTEGER_CST)
2801     return (STMT_VINFO_STRIDED_P (stmt_info)
2802             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2803                 || vect_analyze_group_access (dr)));
2804
2805   /* Not consecutive access - check if it's a part of interleaving group.  */
2806   return vect_analyze_group_access (dr);
2807 }
2808
2809 /* Compare two data-references DRA and DRB to group them into chunks
2810    suitable for grouping.  */
2811
2812 static int
2813 dr_group_sort_cmp (const void *dra_, const void *drb_)
2814 {
2815   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2816   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2817   int cmp;
2818
2819   /* Stabilize sort.  */
2820   if (dra == drb)
2821     return 0;
2822
2823   /* DRs in different loops never belong to the same group.  */
2824   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2825   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2826   if (loopa != loopb)
2827     return loopa->num < loopb->num ? -1 : 1;
2828
2829   /* Ordering of DRs according to base.  */
2830   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2831                                DR_BASE_ADDRESS (drb));
2832   if (cmp != 0)
2833     return cmp;
2834
2835   /* And according to DR_OFFSET.  */
2836   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2837   if (cmp != 0)
2838     return cmp;
2839
2840   /* Put reads before writes.  */
2841   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2842     return DR_IS_READ (dra) ? -1 : 1;
2843
2844   /* Then sort after access size.  */
2845   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2846                                TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2847   if (cmp != 0)
2848     return cmp;
2849
2850   /* And after step.  */
2851   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2852   if (cmp != 0)
2853     return cmp;
2854
2855   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2856   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2857   if (cmp == 0)
2858     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2859   return cmp;
2860 }
2861
2862 /* If OP is the result of a conversion, return the unconverted value,
2863    otherwise return null.  */
2864
2865 static tree
2866 strip_conversion (tree op)
2867 {
2868   if (TREE_CODE (op) != SSA_NAME)
2869     return NULL_TREE;
2870   gimple *stmt = SSA_NAME_DEF_STMT (op);
2871   if (!is_gimple_assign (stmt)
2872       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2873     return NULL_TREE;
2874   return gimple_assign_rhs1 (stmt);
2875 }
2876
2877 /* Return true if vectorizable_* routines can handle statements STMT1
2878    and STMT2 being in a single group.  */
2879
2880 static bool
2881 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2882 {
2883   if (gimple_assign_single_p (stmt1))
2884     return gimple_assign_single_p (stmt2);
2885
2886   if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2887     {
2888       /* Check for two masked loads or two masked stores.  */
2889       if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2890         return false;
2891       internal_fn ifn = gimple_call_internal_fn (stmt1);
2892       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2893         return false;
2894       if (ifn != gimple_call_internal_fn (stmt2))
2895         return false;
2896
2897       /* Check that the masks are the same.  Cope with casts of masks,
2898          like those created by build_mask_conversion.  */
2899       tree mask1 = gimple_call_arg (stmt1, 2);
2900       tree mask2 = gimple_call_arg (stmt2, 2);
2901       if (!operand_equal_p (mask1, mask2, 0))
2902         {
2903           mask1 = strip_conversion (mask1);
2904           if (!mask1)
2905             return false;
2906           mask2 = strip_conversion (mask2);
2907           if (!mask2)
2908             return false;
2909           if (!operand_equal_p (mask1, mask2, 0))
2910             return false;
2911         }
2912       return true;
2913     }
2914
2915   return false;
2916 }
2917
2918 /* Function vect_analyze_data_ref_accesses.
2919
2920    Analyze the access pattern of all the data references in the loop.
2921
2922    FORNOW: the only access pattern that is considered vectorizable is a
2923            simple step 1 (consecutive) access.
2924
2925    FORNOW: handle only arrays and pointer accesses.  */
2926
2927 bool
2928 vect_analyze_data_ref_accesses (vec_info *vinfo)
2929 {
2930   unsigned int i;
2931   vec<data_reference_p> datarefs = vinfo->datarefs;
2932   struct data_reference *dr;
2933
2934   if (dump_enabled_p ())
2935     dump_printf_loc (MSG_NOTE, vect_location,
2936                      "=== vect_analyze_data_ref_accesses ===\n");
2937
2938   if (datarefs.is_empty ())
2939     return true;
2940
2941   /* Sort the array of datarefs to make building the interleaving chains
2942      linear.  Don't modify the original vector's order, it is needed for
2943      determining what dependencies are reversed.  */
2944   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2945   datarefs_copy.qsort (dr_group_sort_cmp);
2946
2947   /* Build the interleaving chains.  */
2948   for (i = 0; i < datarefs_copy.length () - 1;)
2949     {
2950       data_reference_p dra = datarefs_copy[i];
2951       stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
2952       stmt_vec_info lastinfo = NULL;
2953       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2954           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2955         {
2956           ++i;
2957           continue;
2958         }
2959       for (i = i + 1; i < datarefs_copy.length (); ++i)
2960         {
2961           data_reference_p drb = datarefs_copy[i];
2962           stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
2963           if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2964               || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2965             break;
2966
2967           /* ???  Imperfect sorting (non-compatible types, non-modulo
2968              accesses, same accesses) can lead to a group to be artificially
2969              split here as we don't just skip over those.  If it really
2970              matters we can push those to a worklist and re-iterate
2971              over them.  The we can just skip ahead to the next DR here.  */
2972
2973           /* DRs in a different loop should not be put into the same
2974              interleaving group.  */
2975           if (gimple_bb (DR_STMT (dra))->loop_father
2976               != gimple_bb (DR_STMT (drb))->loop_father)
2977             break;
2978
2979           /* Check that the data-refs have same first location (except init)
2980              and they are both either store or load (not load and store,
2981              not masked loads or stores).  */
2982           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2983               || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2984                                         DR_BASE_ADDRESS (drb)) != 0
2985               || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2986               || !can_group_stmts_p (vect_dr_stmt (dra), vect_dr_stmt (drb)))
2987             break;
2988
2989           /* Check that the data-refs have the same constant size.  */
2990           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2991           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2992           if (!tree_fits_uhwi_p (sza)
2993               || !tree_fits_uhwi_p (szb)
2994               || !tree_int_cst_equal (sza, szb))
2995             break;
2996
2997           /* Check that the data-refs have the same step.  */
2998           if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2999             break;
3000
3001           /* Check the types are compatible.
3002              ???  We don't distinguish this during sorting.  */
3003           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3004                                    TREE_TYPE (DR_REF (drb))))
3005             break;
3006
3007           /* Check that the DR_INITs are compile-time constants.  */
3008           if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3009               || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3010             break;
3011
3012           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
3013           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3014           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3015           HOST_WIDE_INT init_prev
3016             = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3017           gcc_assert (init_a <= init_b
3018                       && init_a <= init_prev
3019                       && init_prev <= init_b);
3020
3021           /* Do not place the same access in the interleaving chain twice.  */
3022           if (init_b == init_prev)
3023             {
3024               gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3025                           < gimple_uid (DR_STMT (drb)));
3026               /* ???  For now we simply "drop" the later reference which is
3027                  otherwise the same rather than finishing off this group.
3028                  In the end we'd want to re-process duplicates forming
3029                  multiple groups from the refs, likely by just collecting
3030                  all candidates (including duplicates and split points
3031                  below) in a vector and then process them together.  */
3032               continue;
3033             }
3034
3035           /* If init_b == init_a + the size of the type * k, we have an
3036              interleaving, and DRA is accessed before DRB.  */
3037           HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3038           if (type_size_a == 0
3039               || (init_b - init_a) % type_size_a != 0)
3040             break;
3041
3042           /* If we have a store, the accesses are adjacent.  This splits
3043              groups into chunks we support (we don't support vectorization
3044              of stores with gaps).  */
3045           if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3046             break;
3047
3048           /* If the step (if not zero or non-constant) is greater than the
3049              difference between data-refs' inits this splits groups into
3050              suitable sizes.  */
3051           if (tree_fits_shwi_p (DR_STEP (dra)))
3052             {
3053               HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3054               if (step != 0 && step <= (init_b - init_a))
3055                 break;
3056             }
3057
3058           if (dump_enabled_p ())
3059             {
3060               dump_printf_loc (MSG_NOTE, vect_location,
3061                                "Detected interleaving ");
3062               if (DR_IS_READ (dra))
3063                 dump_printf (MSG_NOTE, "load ");
3064               else
3065                 dump_printf (MSG_NOTE, "store ");
3066               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3067               dump_printf (MSG_NOTE,  " and ");
3068               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3069               dump_printf (MSG_NOTE, "\n");
3070             }
3071
3072           /* Link the found element into the group list.  */
3073           if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3074             {
3075               DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = vect_dr_stmt (dra);
3076               lastinfo = stmtinfo_a;
3077             }
3078           DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = vect_dr_stmt (dra);
3079           DR_GROUP_NEXT_ELEMENT (lastinfo) = vect_dr_stmt (drb);
3080           lastinfo = stmtinfo_b;
3081         }
3082     }
3083
3084   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3085     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr)))
3086         && !vect_analyze_data_ref_access (dr))
3087       {
3088         if (dump_enabled_p ())
3089           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3090                            "not vectorized: complicated access pattern.\n");
3091
3092         if (is_a <bb_vec_info> (vinfo))
3093           {
3094             /* Mark the statement as not vectorizable.  */
3095             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr))) = false;
3096             continue;
3097           }
3098         else
3099           {
3100             datarefs_copy.release ();
3101             return false;
3102           }
3103       }
3104
3105   datarefs_copy.release ();
3106   return true;
3107 }
3108
3109 /* Function vect_vfa_segment_size.
3110
3111    Input:
3112      DR: The data reference.
3113      LENGTH_FACTOR: segment length to consider.
3114
3115    Return a value suitable for the dr_with_seg_len::seg_len field.
3116    This is the "distance travelled" by the pointer from the first
3117    iteration in the segment to the last.  Note that it does not include
3118    the size of the access; in effect it only describes the first byte.  */
3119
3120 static tree
3121 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3122 {
3123   length_factor = size_binop (MINUS_EXPR,
3124                               fold_convert (sizetype, length_factor),
3125                               size_one_node);
3126   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3127                      length_factor);
3128 }
3129
3130 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3131    gives the worst-case number of bytes covered by the segment.  */
3132
3133 static unsigned HOST_WIDE_INT
3134 vect_vfa_access_size (data_reference *dr)
3135 {
3136   stmt_vec_info stmt_vinfo = vinfo_for_stmt (vect_dr_stmt (dr));
3137   tree ref_type = TREE_TYPE (DR_REF (dr));
3138   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3139   unsigned HOST_WIDE_INT access_size = ref_size;
3140   if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3141     {
3142       gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == vect_dr_stmt (dr));
3143       access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3144     }
3145   if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3146       && (vect_supportable_dr_alignment (dr, false)
3147           == dr_explicit_realign_optimized))
3148     {
3149       /* We might access a full vector's worth.  */
3150       tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3151       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3152     }
3153   return access_size;
3154 }
3155
3156 /* Get the minimum alignment for all the scalar accesses that DR describes.  */
3157
3158 static unsigned int
3159 vect_vfa_align (const data_reference *dr)
3160 {
3161   return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3162 }
3163
3164 /* Function vect_no_alias_p.
3165
3166    Given data references A and B with equal base and offset, see whether
3167    the alias relation can be decided at compilation time.  Return 1 if
3168    it can and the references alias, 0 if it can and the references do
3169    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3170    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3171    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
3172
3173 static int
3174 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3175                          tree segment_length_a, tree segment_length_b,
3176                          unsigned HOST_WIDE_INT access_size_a,
3177                          unsigned HOST_WIDE_INT access_size_b)
3178 {
3179   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3180   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3181   poly_uint64 const_length_a;
3182   poly_uint64 const_length_b;
3183
3184   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3185      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3186      [a, a+12) */
3187   if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3188     {
3189       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3190       offset_a = (offset_a + access_size_a) - const_length_a;
3191     }
3192   else
3193     const_length_a = tree_to_poly_uint64 (segment_length_a);
3194   if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3195     {
3196       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3197       offset_b = (offset_b + access_size_b) - const_length_b;
3198     }
3199   else
3200     const_length_b = tree_to_poly_uint64 (segment_length_b);
3201
3202   const_length_a += access_size_a;
3203   const_length_b += access_size_b;
3204
3205   if (ranges_known_overlap_p (offset_a, const_length_a,
3206                               offset_b, const_length_b))
3207     return 1;
3208
3209   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3210                                offset_b, const_length_b))
3211     return 0;
3212
3213   return -1;
3214 }
3215
3216 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3217    in DDR is >= VF.  */
3218
3219 static bool
3220 dependence_distance_ge_vf (data_dependence_relation *ddr,
3221                            unsigned int loop_depth, poly_uint64 vf)
3222 {
3223   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3224       || DDR_NUM_DIST_VECTS (ddr) == 0)
3225     return false;
3226
3227   /* If the dependence is exact, we should have limited the VF instead.  */
3228   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3229
3230   unsigned int i;
3231   lambda_vector dist_v;
3232   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3233     {
3234       HOST_WIDE_INT dist = dist_v[loop_depth];
3235       if (dist != 0
3236           && !(dist > 0 && DDR_REVERSED_P (ddr))
3237           && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3238         return false;
3239     }
3240
3241   if (dump_enabled_p ())
3242     {
3243       dump_printf_loc (MSG_NOTE, vect_location,
3244                        "dependence distance between ");
3245       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3246       dump_printf (MSG_NOTE,  " and ");
3247       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3248       dump_printf (MSG_NOTE,  " is >= VF\n");
3249     }
3250
3251   return true;
3252 }
3253
3254 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3255
3256 static void
3257 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3258 {
3259   dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3260   dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3261   dump_printf (dump_kind, ") >= ");
3262   dump_dec (dump_kind, lower_bound.min_value);
3263 }
3264
3265 /* Record that the vectorized loop requires the vec_lower_bound described
3266    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3267
3268 static void
3269 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3270                         poly_uint64 min_value)
3271 {
3272   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3273   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3274     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3275       {
3276         unsigned_p &= lower_bounds[i].unsigned_p;
3277         min_value = upper_bound (lower_bounds[i].min_value, min_value);
3278         if (lower_bounds[i].unsigned_p != unsigned_p
3279             || maybe_lt (lower_bounds[i].min_value, min_value))
3280           {
3281             lower_bounds[i].unsigned_p = unsigned_p;
3282             lower_bounds[i].min_value = min_value;
3283             if (dump_enabled_p ())
3284               {
3285                 dump_printf_loc (MSG_NOTE, vect_location,
3286                                  "updating run-time check to ");
3287                 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3288                 dump_printf (MSG_NOTE, "\n");
3289               }
3290           }
3291         return;
3292       }
3293
3294   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3295   if (dump_enabled_p ())
3296     {
3297       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3298       dump_lower_bound (MSG_NOTE, lower_bound);
3299       dump_printf (MSG_NOTE, "\n");
3300     }
3301   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3302 }
3303
3304 /* Return true if it's unlikely that the step of the vectorized form of DR
3305    will span fewer than GAP bytes.  */
3306
3307 static bool
3308 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3309 {
3310   stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
3311   HOST_WIDE_INT count
3312     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3313   if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3314     count *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
3315   return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3316 }
3317
3318 /* Return true if we know that there is no alias between DR_A and DR_B
3319    when abs (DR_STEP (DR_A)) >= N for some N.  When returning true, set
3320    *LOWER_BOUND_OUT to this N.  */
3321
3322 static bool
3323 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3324                                 poly_uint64 *lower_bound_out)
3325 {
3326   /* Check that there is a constant gap of known sign between DR_A
3327      and DR_B.  */
3328   poly_int64 init_a, init_b;
3329   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3330       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3331       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3332       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3333       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3334       || !ordered_p (init_a, init_b))
3335     return false;
3336
3337   /* Sort DR_A and DR_B by the address they access.  */
3338   if (maybe_lt (init_b, init_a))
3339     {
3340       std::swap (init_a, init_b);
3341       std::swap (dr_a, dr_b);
3342     }
3343
3344   /* If the two accesses could be dependent within a scalar iteration,
3345      make sure that we'd retain their order.  */
3346   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3347       && !vect_preserves_scalar_order_p (vect_dr_stmt (dr_a),
3348                                          vect_dr_stmt (dr_b)))
3349     return false;
3350
3351   /* There is no alias if abs (DR_STEP) is greater than or equal to
3352      the bytes spanned by the combination of the two accesses.  */
3353   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3354   return true;
3355 }
3356
3357 /* Function vect_prune_runtime_alias_test_list.
3358
3359    Prune a list of ddrs to be tested at run-time by versioning for alias.
3360    Merge several alias checks into one if possible.
3361    Return FALSE if resulting list of ddrs is longer then allowed by
3362    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3363
3364 bool
3365 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3366 {
3367   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3368   hash_set <tree_pair_hash> compared_objects;
3369
3370   vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3371   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3372     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3373   vec<vec_object_pair> &check_unequal_addrs
3374     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3375   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3376   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3377
3378   ddr_p ddr;
3379   unsigned int i;
3380   tree length_factor;
3381
3382   if (dump_enabled_p ())
3383     dump_printf_loc (MSG_NOTE, vect_location,
3384                      "=== vect_prune_runtime_alias_test_list ===\n");
3385
3386   /* Step values are irrelevant for aliasing if the number of vector
3387      iterations is equal to the number of scalar iterations (which can
3388      happen for fully-SLP loops).  */
3389   bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3390
3391   if (!ignore_step_p)
3392     {
3393       /* Convert the checks for nonzero steps into bound tests.  */
3394       tree value;
3395       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3396         vect_check_lower_bound (loop_vinfo, value, true, 1);
3397     }
3398
3399   if (may_alias_ddrs.is_empty ())
3400     return true;
3401
3402   comp_alias_ddrs.create (may_alias_ddrs.length ());
3403
3404   unsigned int loop_depth
3405     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3406                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
3407
3408   /* First, we collect all data ref pairs for aliasing checks.  */
3409   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3410     {
3411       int comp_res;
3412       poly_uint64 lower_bound;
3413       struct data_reference *dr_a, *dr_b;
3414       gimple *dr_group_first_a, *dr_group_first_b;
3415       tree segment_length_a, segment_length_b;
3416       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3417       unsigned int align_a, align_b;
3418       gimple *stmt_a, *stmt_b;
3419
3420       /* Ignore the alias if the VF we chose ended up being no greater
3421          than the dependence distance.  */
3422       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3423         continue;
3424
3425       if (DDR_OBJECT_A (ddr))
3426         {
3427           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3428           if (!compared_objects.add (new_pair))
3429             {
3430               if (dump_enabled_p ())
3431                 {
3432                   dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3433                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3434                   dump_printf (MSG_NOTE, " and ");
3435                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3436                   dump_printf (MSG_NOTE, " have different addresses\n");
3437                 }
3438               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3439             }
3440           continue;
3441         }
3442
3443       dr_a = DDR_A (ddr);
3444       stmt_a = vect_dr_stmt (DDR_A (ddr));
3445
3446       dr_b = DDR_B (ddr);
3447       stmt_b = vect_dr_stmt (DDR_B (ddr));
3448
3449       /* Skip the pair if inter-iteration dependencies are irrelevant
3450          and intra-iteration dependencies are guaranteed to be honored.  */
3451       if (ignore_step_p
3452           && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3453               || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3454         {
3455           if (dump_enabled_p ())
3456             {
3457               dump_printf_loc (MSG_NOTE, vect_location,
3458                                "no need for alias check between ");
3459               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3460               dump_printf (MSG_NOTE, " and ");
3461               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3462               dump_printf (MSG_NOTE, " when VF is 1\n");
3463             }
3464           continue;
3465         }
3466
3467       /* See whether we can handle the alias using a bounds check on
3468          the step, and whether that's likely to be the best approach.
3469          (It might not be, for example, if the minimum step is much larger
3470          than the number of bytes handled by one vector iteration.)  */
3471       if (!ignore_step_p
3472           && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3473           && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3474           && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3475               || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3476         {
3477           bool unsigned_p = dr_known_forward_stride_p (dr_a);
3478           if (dump_enabled_p ())
3479             {
3480               dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3481               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3482               dump_printf (MSG_NOTE, " and ");
3483               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3484               dump_printf (MSG_NOTE, " when the step ");
3485               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3486               dump_printf (MSG_NOTE, " is outside ");
3487               if (unsigned_p)
3488                 dump_printf (MSG_NOTE, "[0");
3489               else
3490                 {
3491                   dump_printf (MSG_NOTE, "(");
3492                   dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3493                 }
3494               dump_printf (MSG_NOTE, ", ");
3495               dump_dec (MSG_NOTE, lower_bound);
3496               dump_printf (MSG_NOTE, ")\n");
3497             }
3498           vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3499                                   lower_bound);
3500           continue;
3501         }
3502
3503       dr_group_first_a = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3504       if (dr_group_first_a)
3505         {
3506           stmt_a = dr_group_first_a;
3507           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3508         }
3509
3510       dr_group_first_b = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3511       if (dr_group_first_b)
3512         {
3513           stmt_b = dr_group_first_b;
3514           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3515         }
3516
3517       if (ignore_step_p)
3518         {
3519           segment_length_a = size_zero_node;
3520           segment_length_b = size_zero_node;
3521         }
3522       else
3523         {
3524           if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3525             length_factor = scalar_loop_iters;
3526           else
3527             length_factor = size_int (vect_factor);
3528           segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3529           segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3530         }
3531       access_size_a = vect_vfa_access_size (dr_a);
3532       access_size_b = vect_vfa_access_size (dr_b);
3533       align_a = vect_vfa_align (dr_a);
3534       align_b = vect_vfa_align (dr_b);
3535
3536       comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3537                                         DR_BASE_ADDRESS (dr_b));
3538       if (comp_res == 0)
3539         comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3540                                           DR_OFFSET (dr_b));
3541
3542       /* See whether the alias is known at compilation time.  */
3543       if (comp_res == 0
3544           && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3545           && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3546           && poly_int_tree_p (segment_length_a)
3547           && poly_int_tree_p (segment_length_b))
3548         {
3549           int res = vect_compile_time_alias (dr_a, dr_b,
3550                                              segment_length_a,
3551                                              segment_length_b,
3552                                              access_size_a,
3553                                              access_size_b);
3554           if (res >= 0 && dump_enabled_p ())
3555             {
3556               dump_printf_loc (MSG_NOTE, vect_location,
3557                                "can tell at compile time that ");
3558               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3559               dump_printf (MSG_NOTE, " and ");
3560               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3561               if (res == 0)
3562                 dump_printf (MSG_NOTE, " do not alias\n");
3563               else
3564                 dump_printf (MSG_NOTE, " alias\n");
3565             }
3566
3567           if (res == 0)
3568             continue;
3569
3570           if (res == 1)
3571             {
3572               if (dump_enabled_p ())
3573                 dump_printf_loc (MSG_NOTE, vect_location,
3574                                  "not vectorized: compilation time alias.\n");
3575               return false;
3576             }
3577         }
3578
3579       dr_with_seg_len_pair_t dr_with_seg_len_pair
3580         (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3581          dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3582
3583       /* Canonicalize pairs by sorting the two DR members.  */
3584       if (comp_res > 0)
3585         std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3586
3587       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3588     }
3589
3590   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3591
3592   unsigned int count = (comp_alias_ddrs.length ()
3593                         + check_unequal_addrs.length ());
3594
3595   dump_printf_loc (MSG_NOTE, vect_location,
3596                    "improved number of alias checks from %d to %d\n",
3597                    may_alias_ddrs.length (), count);
3598   if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3599     {
3600       if (dump_enabled_p ())
3601         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3602                          "number of versioning for alias "
3603                          "run-time tests exceeds %d "
3604                          "(--param vect-max-version-for-alias-checks)\n",
3605                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3606       return false;
3607     }
3608
3609   return true;
3610 }
3611
3612 /* Check whether we can use an internal function for a gather load
3613    or scatter store.  READ_P is true for loads and false for stores.
3614    MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
3615    the type of the memory elements being loaded or stored.  OFFSET_BITS
3616    is the number of bits in each scalar offset and OFFSET_SIGN is the
3617    sign of the offset.  SCALE is the amount by which the offset should
3618    be multiplied *after* it has been converted to address width.
3619
3620    Return true if the function is supported, storing the function
3621    id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT.  */
3622
3623 bool
3624 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3625                           tree memory_type, unsigned int offset_bits,
3626                           signop offset_sign, int scale,
3627                           internal_fn *ifn_out, tree *element_type_out)
3628 {
3629   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3630   unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3631   if (offset_bits > element_bits)
3632     /* Internal functions require the offset to be the same width as
3633        the vector elements.  We can extend narrower offsets, but it isn't
3634        safe to truncate wider offsets.  */
3635     return false;
3636
3637   if (element_bits != memory_bits)
3638     /* For now the vector elements must be the same width as the
3639        memory elements.  */
3640     return false;
3641
3642   /* Work out which function we need.  */
3643   internal_fn ifn;
3644   if (read_p)
3645     ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3646   else
3647     ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3648
3649   /* Test whether the target supports this combination.  */
3650   if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3651                                                offset_sign, scale))
3652     return false;
3653
3654   *ifn_out = ifn;
3655   *element_type_out = TREE_TYPE (vectype);
3656   return true;
3657 }
3658
3659 /* CALL is a call to an internal gather load or scatter store function.
3660    Describe the operation in INFO.  */
3661
3662 static void
3663 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3664 {
3665   stmt_vec_info stmt_info = vinfo_for_stmt (call);
3666   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3667   data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3668
3669   info->ifn = gimple_call_internal_fn (call);
3670   info->decl = NULL_TREE;
3671   info->base = gimple_call_arg (call, 0);
3672   info->offset = gimple_call_arg (call, 1);
3673   info->offset_dt = vect_unknown_def_type;
3674   info->offset_vectype = NULL_TREE;
3675   info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3676   info->element_type = TREE_TYPE (vectype);
3677   info->memory_type = TREE_TYPE (DR_REF (dr));
3678 }
3679
3680 /* Return true if a non-affine read or write in STMT is suitable for a
3681    gather load or scatter store.  Describe the operation in *INFO if so.  */
3682
3683 bool
3684 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3685                            gather_scatter_info *info)
3686 {
3687   HOST_WIDE_INT scale = 1;
3688   poly_int64 pbitpos, pbitsize;
3689   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3690   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3691   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3692   tree offtype = NULL_TREE;
3693   tree decl = NULL_TREE, base, off;
3694   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3695   tree memory_type = TREE_TYPE (DR_REF (dr));
3696   machine_mode pmode;
3697   int punsignedp, reversep, pvolatilep = 0;
3698   internal_fn ifn;
3699   tree element_type;
3700   bool masked_p = false;
3701
3702   /* See whether this is already a call to a gather/scatter internal function.
3703      If not, see whether it's a masked load or store.  */
3704   gcall *call = dyn_cast <gcall *> (stmt);
3705   if (call && gimple_call_internal_p (call))
3706     {
3707       ifn = gimple_call_internal_fn (stmt);
3708       if (internal_gather_scatter_fn_p (ifn))
3709         {
3710           vect_describe_gather_scatter_call (call, info);
3711           return true;
3712         }
3713       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3714     }
3715
3716   /* True if we should aim to use internal functions rather than
3717      built-in functions.  */
3718   bool use_ifn_p = (DR_IS_READ (dr)
3719                     ? supports_vec_gather_load_p ()
3720                     : supports_vec_scatter_store_p ());
3721
3722   base = DR_REF (dr);
3723   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3724      see if we can use the def stmt of the address.  */
3725   if (masked_p
3726       && TREE_CODE (base) == MEM_REF
3727       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3728       && integer_zerop (TREE_OPERAND (base, 1))
3729       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3730     {
3731       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3732       if (is_gimple_assign (def_stmt)
3733           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3734         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3735     }
3736
3737   /* The gather and scatter builtins need address of the form
3738      loop_invariant + vector * {1, 2, 4, 8}
3739      or
3740      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3741      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3742      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3743      multiplications and additions in it.  To get a vector, we need
3744      a single SSA_NAME that will be defined in the loop and will
3745      contain everything that is not loop invariant and that can be
3746      vectorized.  The following code attempts to find such a preexistng
3747      SSA_NAME OFF and put the loop invariants into a tree BASE
3748      that can be gimplified before the loop.  */
3749   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3750                               &punsignedp, &reversep, &pvolatilep);
3751   gcc_assert (base && !reversep);
3752   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3753
3754   if (TREE_CODE (base) == MEM_REF)
3755     {
3756       if (!integer_zerop (TREE_OPERAND (base, 1)))
3757         {
3758           if (off == NULL_TREE)
3759             off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3760           else
3761             off = size_binop (PLUS_EXPR, off,
3762                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3763         }
3764       base = TREE_OPERAND (base, 0);
3765     }
3766   else
3767     base = build_fold_addr_expr (base);
3768
3769   if (off == NULL_TREE)
3770     off = size_zero_node;
3771
3772   /* If base is not loop invariant, either off is 0, then we start with just
3773      the constant offset in the loop invariant BASE and continue with base
3774      as OFF, otherwise give up.
3775      We could handle that case by gimplifying the addition of base + off
3776      into some SSA_NAME and use that as off, but for now punt.  */
3777   if (!expr_invariant_in_loop_p (loop, base))
3778     {
3779       if (!integer_zerop (off))
3780         return false;
3781       off = base;
3782       base = size_int (pbytepos);
3783     }
3784   /* Otherwise put base + constant offset into the loop invariant BASE
3785      and continue with OFF.  */
3786   else
3787     {
3788       base = fold_convert (sizetype, base);
3789       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3790     }
3791
3792   /* OFF at this point may be either a SSA_NAME or some tree expression
3793      from get_inner_reference.  Try to peel off loop invariants from it
3794      into BASE as long as possible.  */
3795   STRIP_NOPS (off);
3796   while (offtype == NULL_TREE)
3797     {
3798       enum tree_code code;
3799       tree op0, op1, add = NULL_TREE;
3800
3801       if (TREE_CODE (off) == SSA_NAME)
3802         {
3803           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3804
3805           if (expr_invariant_in_loop_p (loop, off))
3806             return false;
3807
3808           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3809             break;
3810
3811           op0 = gimple_assign_rhs1 (def_stmt);
3812           code = gimple_assign_rhs_code (def_stmt);
3813           op1 = gimple_assign_rhs2 (def_stmt);
3814         }
3815       else
3816         {
3817           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3818             return false;
3819           code = TREE_CODE (off);
3820           extract_ops_from_tree (off, &code, &op0, &op1);
3821         }
3822       switch (code)
3823         {
3824         case POINTER_PLUS_EXPR:
3825         case PLUS_EXPR:
3826           if (expr_invariant_in_loop_p (loop, op0))
3827             {
3828               add = op0;
3829               off = op1;
3830             do_add:
3831               add = fold_convert (sizetype, add);
3832               if (scale != 1)
3833                 add = size_binop (MULT_EXPR, add, size_int (scale));
3834               base = size_binop (PLUS_EXPR, base, add);
3835               continue;
3836             }
3837           if (expr_invariant_in_loop_p (loop, op1))
3838             {
3839               add = op1;
3840               off = op0;
3841               goto do_add;
3842             }
3843           break;
3844         case MINUS_EXPR:
3845           if (expr_invariant_in_loop_p (loop, op1))
3846             {
3847               add = fold_convert (sizetype, op1);
3848               add = size_binop (MINUS_EXPR, size_zero_node, add);
3849               off = op0;
3850               goto do_add;
3851             }
3852           break;
3853         case MULT_EXPR:
3854           if (scale == 1 && tree_fits_shwi_p (op1))
3855             {
3856               int new_scale = tree_to_shwi (op1);
3857               /* Only treat this as a scaling operation if the target
3858                  supports it.  */
3859               if (use_ifn_p
3860                   && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3861                                                 vectype, memory_type, 1,
3862                                                 TYPE_SIGN (TREE_TYPE (op0)),
3863                                                 new_scale, &ifn,
3864                                                 &element_type))
3865                 break;
3866               scale = new_scale;
3867               off = op0;
3868               continue;
3869             }
3870           break;
3871         case SSA_NAME:
3872           off = op0;
3873           continue;
3874         CASE_CONVERT:
3875           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3876               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3877             break;
3878           if (TYPE_PRECISION (TREE_TYPE (op0))
3879               == TYPE_PRECISION (TREE_TYPE (off)))
3880             {
3881               off = op0;
3882               continue;
3883             }
3884
3885           /* The internal functions need the offset to be the same width
3886              as the elements of VECTYPE.  Don't include operations that
3887              cast the offset from that width to a different width.  */
3888           if (use_ifn_p
3889               && (int_size_in_bytes (TREE_TYPE (vectype))
3890                   == int_size_in_bytes (TREE_TYPE (off))))
3891             break;
3892
3893           if (TYPE_PRECISION (TREE_TYPE (op0))
3894               < TYPE_PRECISION (TREE_TYPE (off)))
3895             {
3896               off = op0;
3897               offtype = TREE_TYPE (off);
3898               STRIP_NOPS (off);
3899               continue;
3900             }
3901           break;
3902         default:
3903           break;
3904         }
3905       break;
3906     }
3907
3908   /* If at the end OFF still isn't a SSA_NAME or isn't
3909      defined in the loop, punt.  */
3910   if (TREE_CODE (off) != SSA_NAME
3911       || expr_invariant_in_loop_p (loop, off))
3912     return false;
3913
3914   if (offtype == NULL_TREE)
3915     offtype = TREE_TYPE (off);
3916
3917   if (use_ifn_p)
3918     {
3919       if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3920                                      memory_type, TYPE_PRECISION (offtype),
3921                                      TYPE_SIGN (offtype), scale, &ifn,
3922                                      &element_type))
3923         return false;
3924     }
3925   else
3926     {
3927       if (DR_IS_READ (dr))
3928         {
3929           if (targetm.vectorize.builtin_gather)
3930             decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3931         }
3932       else
3933         {
3934           if (targetm.vectorize.builtin_scatter)
3935             decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3936         }
3937
3938       if (!decl)
3939         return false;
3940
3941       ifn = IFN_LAST;
3942       element_type = TREE_TYPE (vectype);
3943     }
3944
3945   info->ifn = ifn;
3946   info->decl = decl;
3947   info->base = base;
3948   info->offset = off;
3949   info->offset_dt = vect_unknown_def_type;
3950   info->offset_vectype = NULL_TREE;
3951   info->scale = scale;
3952   info->element_type = element_type;
3953   info->memory_type = memory_type;
3954   return true;
3955 }
3956
3957 /* Find the data references in STMT, analyze them with respect to LOOP and
3958    append them to DATAREFS.  Return false if datarefs in this stmt cannot
3959    be handled.  */
3960
3961 bool
3962 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3963                                vec<data_reference_p> *datarefs)
3964 {
3965   /* We can ignore clobbers for dataref analysis - they are removed during
3966      loop vectorization and BB vectorization checks dependences with a
3967      stmt walk.  */
3968   if (gimple_clobber_p (stmt))
3969     return true;
3970
3971   if (gimple_has_volatile_ops (stmt))
3972     {
3973       if (dump_enabled_p ())
3974         {
3975           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3976                            "not vectorized: volatile type ");
3977           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3978         }
3979       return false;
3980     }
3981
3982   if (stmt_can_throw_internal (stmt))
3983     {
3984       if (dump_enabled_p ())
3985         {
3986           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3987                            "not vectorized: statement can throw an "
3988                            "exception ");
3989           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3990         }
3991       return false;
3992     }
3993
3994   auto_vec<data_reference_p, 2> refs;
3995   if (!find_data_references_in_stmt (loop, stmt, &refs))
3996     return false;
3997
3998   if (refs.is_empty ())
3999     return true;
4000
4001   if (refs.length () > 1)
4002     {
4003       if (dump_enabled_p ())
4004         {
4005           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4006                            "not vectorized: more than one data ref "
4007                            "in stmt: ");
4008           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4009         }
4010       return false;
4011     }
4012
4013   if (gcall *call = dyn_cast <gcall *> (stmt))
4014     if (!gimple_call_internal_p (call)
4015         || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4016             && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4017       {
4018         if (dump_enabled_p ())
4019           {
4020             dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
4021                              "not vectorized: dr in a call ");
4022             dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4023           }
4024         return false;
4025       }
4026
4027   data_reference_p dr = refs.pop ();
4028   if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4029       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4030     {
4031       if (dump_enabled_p ())
4032         {
4033           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4034                            "not vectorized: statement is bitfield "
4035                            "access ");
4036           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4037         }
4038       return false;
4039     }
4040
4041   if (DR_BASE_ADDRESS (dr)
4042       && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4043     {
4044       if (dump_enabled_p ())
4045         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4046                          "not vectorized: base addr of dr is a "
4047                          "constant\n");
4048       return false;
4049     }
4050
4051   datarefs->safe_push (dr);
4052   return true;
4053 }
4054
4055 /* Function vect_analyze_data_refs.
4056
4057   Find all the data references in the loop or basic block.
4058
4059    The general structure of the analysis of data refs in the vectorizer is as
4060    follows:
4061    1- vect_analyze_data_refs(loop/bb): call
4062       compute_data_dependences_for_loop/bb to find and analyze all data-refs
4063       in the loop/bb and their dependences.
4064    2- vect_analyze_dependences(): apply dependence testing using ddrs.
4065    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4066    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4067
4068 */
4069
4070 bool
4071 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4072 {
4073   struct loop *loop = NULL;
4074   unsigned int i;
4075   struct data_reference *dr;
4076   tree scalar_type;
4077
4078   if (dump_enabled_p ())
4079     dump_printf_loc (MSG_NOTE, vect_location,
4080                      "=== vect_analyze_data_refs ===\n");
4081
4082   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4083     loop = LOOP_VINFO_LOOP (loop_vinfo);
4084
4085   /* Go through the data-refs, check that the analysis succeeded.  Update
4086      pointer from stmt_vec_info struct to DR and vectype.  */
4087
4088   vec<data_reference_p> datarefs = vinfo->datarefs;
4089   FOR_EACH_VEC_ELT (datarefs, i, dr)
4090     {
4091       gimple *stmt;
4092       stmt_vec_info stmt_info;
4093       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4094       bool simd_lane_access = false;
4095       poly_uint64 vf;
4096
4097       gcc_assert (DR_REF (dr));
4098       stmt = vect_dr_stmt (dr);
4099       stmt_info = vinfo_for_stmt (stmt);
4100
4101       /* Check that analysis of the data-ref succeeded.  */
4102       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4103           || !DR_STEP (dr))
4104         {
4105           bool maybe_gather
4106             = DR_IS_READ (dr)
4107               && !TREE_THIS_VOLATILE (DR_REF (dr))
4108               && (targetm.vectorize.builtin_gather != NULL
4109                   || supports_vec_gather_load_p ());
4110           bool maybe_scatter
4111             = DR_IS_WRITE (dr)
4112               && !TREE_THIS_VOLATILE (DR_REF (dr))
4113               && (targetm.vectorize.builtin_scatter != NULL
4114                   || supports_vec_scatter_store_p ());
4115           bool maybe_simd_lane_access
4116             = is_a <loop_vec_info> (vinfo) && loop->simduid;
4117
4118           /* If target supports vector gather loads or scatter stores, or if
4119              this might be a SIMD lane access, see if they can't be used.  */
4120           if (is_a <loop_vec_info> (vinfo)
4121               && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
4122               && !nested_in_vect_loop_p (loop, stmt))
4123             {
4124               struct data_reference *newdr
4125                 = create_data_ref (NULL, loop_containing_stmt (stmt),
4126                                    DR_REF (dr), stmt, !maybe_scatter,
4127                                    DR_IS_CONDITIONAL_IN_STMT (dr));
4128               gcc_assert (newdr != NULL && DR_REF (newdr));
4129               if (DR_BASE_ADDRESS (newdr)
4130                   && DR_OFFSET (newdr)
4131                   && DR_INIT (newdr)
4132                   && DR_STEP (newdr)
4133                   && integer_zerop (DR_STEP (newdr)))
4134                 {
4135                   if (maybe_simd_lane_access)
4136                     {
4137                       tree off = DR_OFFSET (newdr);
4138                       STRIP_NOPS (off);
4139                       if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4140                           && TREE_CODE (off) == MULT_EXPR
4141                           && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4142                         {
4143                           tree step = TREE_OPERAND (off, 1);
4144                           off = TREE_OPERAND (off, 0);
4145                           STRIP_NOPS (off);
4146                           if (CONVERT_EXPR_P (off)
4147                               && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4148                                                                           0)))
4149                                  < TYPE_PRECISION (TREE_TYPE (off)))
4150                             off = TREE_OPERAND (off, 0);
4151                           if (TREE_CODE (off) == SSA_NAME)
4152                             {
4153                               gimple *def = SSA_NAME_DEF_STMT (off);
4154                               tree reft = TREE_TYPE (DR_REF (newdr));
4155                               if (is_gimple_call (def)
4156                                   && gimple_call_internal_p (def)
4157                                   && (gimple_call_internal_fn (def)
4158                                       == IFN_GOMP_SIMD_LANE))
4159                                 {
4160                                   tree arg = gimple_call_arg (def, 0);
4161                                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
4162                                   arg = SSA_NAME_VAR (arg);
4163                                   if (arg == loop->simduid
4164                                       /* For now.  */
4165                                       && tree_int_cst_equal
4166                                            (TYPE_SIZE_UNIT (reft),
4167                                             step))
4168                                     {
4169                                       DR_OFFSET (newdr) = ssize_int (0);
4170                                       DR_STEP (newdr) = step;
4171                                       DR_OFFSET_ALIGNMENT (newdr)
4172                                         = BIGGEST_ALIGNMENT;
4173                                       DR_STEP_ALIGNMENT (newdr)
4174                                         = highest_pow2_factor (step);
4175                                       dr = newdr;
4176                                       simd_lane_access = true;
4177                                     }
4178                                 }
4179                             }
4180                         }
4181                     }
4182                   if (!simd_lane_access && (maybe_gather || maybe_scatter))
4183                     {
4184                       dr = newdr;
4185                       if (maybe_gather)
4186                         gatherscatter = GATHER;
4187                       else
4188                         gatherscatter = SCATTER;
4189                     }
4190                 }
4191               if (gatherscatter == SG_NONE && !simd_lane_access)
4192                 free_data_ref (newdr);
4193             }
4194
4195           if (gatherscatter == SG_NONE && !simd_lane_access)
4196             {
4197               if (dump_enabled_p ())
4198                 {
4199                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4200                                    "not vectorized: data ref analysis "
4201                                    "failed ");
4202                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4203                 }
4204               if (is_a <bb_vec_info> (vinfo))
4205                 {
4206                   /* In BB vectorization the ref can still participate
4207                      in dependence analysis, we just can't vectorize it.  */
4208                   STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4209                   continue;
4210                 }
4211               return false;
4212             }
4213         }
4214
4215       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4216           && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4217           && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4218         {
4219           if (dump_enabled_p ())
4220             {
4221               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4222                                "not vectorized: base object not addressable "
4223                                "for stmt: ");
4224               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4225             }
4226           if (is_a <bb_vec_info> (vinfo))
4227             {
4228               /* In BB vectorization the ref can still participate
4229                  in dependence analysis, we just can't vectorize it.  */
4230               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4231               continue;
4232             }
4233           return false;
4234         }
4235
4236       if (is_a <loop_vec_info> (vinfo)
4237           && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4238         {
4239           if (nested_in_vect_loop_p (loop, stmt))
4240             {
4241               if (dump_enabled_p ())
4242                 {
4243                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4244                                    "not vectorized: not suitable for strided "
4245                                    "load ");
4246                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4247                 }
4248               return false;
4249             }
4250           STMT_VINFO_STRIDED_P (stmt_info) = true;
4251         }
4252
4253       /* Update DR field in stmt_vec_info struct.  */
4254
4255       /* If the dataref is in an inner-loop of the loop that is considered for
4256          for vectorization, we also want to analyze the access relative to
4257          the outer-loop (DR contains information only relative to the
4258          inner-most enclosing loop).  We do that by building a reference to the
4259          first location accessed by the inner-loop, and analyze it relative to
4260          the outer-loop.  */
4261       if (loop && nested_in_vect_loop_p (loop, stmt))
4262         {
4263           /* Build a reference to the first location accessed by the
4264              inner loop: *(BASE + INIT + OFFSET).  By construction,
4265              this address must be invariant in the inner loop, so we
4266              can consider it as being used in the outer loop.  */
4267           tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4268           tree offset = unshare_expr (DR_OFFSET (dr));
4269           tree init = unshare_expr (DR_INIT (dr));
4270           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4271                                           init, offset);
4272           tree init_addr = fold_build_pointer_plus (base, init_offset);
4273           tree init_ref = build_fold_indirect_ref (init_addr);
4274
4275           if (dump_enabled_p ())
4276             {
4277               dump_printf_loc (MSG_NOTE, vect_location,
4278                                "analyze in outer loop: ");
4279               dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4280               dump_printf (MSG_NOTE, "\n");
4281             }
4282
4283           if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4284                                      init_ref, loop))
4285             /* dr_analyze_innermost already explained the failure.  */
4286             return false;
4287
4288           if (dump_enabled_p ())
4289             {
4290               dump_printf_loc (MSG_NOTE, vect_location,
4291                                "\touter base_address: ");
4292               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4293                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4294               dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4295               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4296                                  STMT_VINFO_DR_OFFSET (stmt_info));
4297               dump_printf (MSG_NOTE,
4298                            "\n\touter constant offset from base address: ");
4299               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4300                                  STMT_VINFO_DR_INIT (stmt_info));
4301               dump_printf (MSG_NOTE, "\n\touter step: ");
4302               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4303                                  STMT_VINFO_DR_STEP (stmt_info));
4304               dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4305                            STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4306               dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4307                            STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4308               dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4309                            STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4310               dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4311                            STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4312             }
4313         }
4314
4315       gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
4316       STMT_VINFO_DATA_REF (stmt_info) = dr;
4317       if (simd_lane_access)
4318         {
4319           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4320           free_data_ref (datarefs[i]);
4321           datarefs[i] = dr;
4322         }
4323
4324       /* Set vectype for STMT.  */
4325       scalar_type = TREE_TYPE (DR_REF (dr));
4326       STMT_VINFO_VECTYPE (stmt_info)
4327         = get_vectype_for_scalar_type (scalar_type);
4328       if (!STMT_VINFO_VECTYPE (stmt_info))
4329         {
4330           if (dump_enabled_p ())
4331             {
4332               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4333                                "not vectorized: no vectype for stmt: ");
4334               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4335               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4336               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4337                                  scalar_type);
4338               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4339             }
4340
4341           if (is_a <bb_vec_info> (vinfo))
4342             {
4343               /* No vector type is fine, the ref can still participate
4344                  in dependence analysis, we just can't vectorize it.  */
4345               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4346               continue;
4347             }
4348
4349           if (gatherscatter != SG_NONE || simd_lane_access)
4350             {
4351               STMT_VINFO_DATA_REF (stmt_info) = NULL;
4352               if (gatherscatter != SG_NONE)
4353                 free_data_ref (dr);
4354             }
4355           return false;
4356         }
4357       else
4358         {
4359           if (dump_enabled_p ())
4360             {
4361               dump_printf_loc (MSG_NOTE, vect_location,
4362                                "got vectype for stmt: ");
4363               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4364               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4365                                  STMT_VINFO_VECTYPE (stmt_info));
4366               dump_printf (MSG_NOTE, "\n");
4367             }
4368         }
4369
4370       /* Adjust the minimal vectorization factor according to the
4371          vector type.  */
4372       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4373       *min_vf = upper_bound (*min_vf, vf);
4374
4375       if (gatherscatter != SG_NONE)
4376         {
4377           gather_scatter_info gs_info;
4378           if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4379                                           &gs_info)
4380               || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4381             {
4382               STMT_VINFO_DATA_REF (stmt_info) = NULL;
4383               free_data_ref (dr);
4384               if (dump_enabled_p ())
4385                 {
4386                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4387                                    (gatherscatter == GATHER) ?
4388                                    "not vectorized: not suitable for gather "
4389                                    "load " :
4390                                    "not vectorized: not suitable for scatter "
4391                                    "store ");
4392                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4393                 }
4394               return false;
4395             }
4396
4397           free_data_ref (datarefs[i]);
4398           datarefs[i] = dr;
4399           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4400         }
4401     }
4402
4403   /* We used to stop processing and prune the list here.  Verify we no
4404      longer need to.  */
4405   gcc_assert (i == datarefs.length ());
4406
4407   return true;
4408 }
4409
4410
4411 /* Function vect_get_new_vect_var.
4412
4413    Returns a name for a new variable.  The current naming scheme appends the
4414    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4415    the name of vectorizer generated variables, and appends that to NAME if
4416    provided.  */
4417
4418 tree
4419 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4420 {
4421   const char *prefix;
4422   tree new_vect_var;
4423
4424   switch (var_kind)
4425   {
4426   case vect_simple_var:
4427     prefix = "vect";
4428     break;
4429   case vect_scalar_var:
4430     prefix = "stmp";
4431     break;
4432   case vect_mask_var:
4433     prefix = "mask";
4434     break;
4435   case vect_pointer_var:
4436     prefix = "vectp";
4437     break;
4438   default:
4439     gcc_unreachable ();
4440   }
4441
4442   if (name)
4443     {
4444       char* tmp = concat (prefix, "_", name, NULL);
4445       new_vect_var = create_tmp_reg (type, tmp);
4446       free (tmp);
4447     }
4448   else
4449     new_vect_var = create_tmp_reg (type, prefix);
4450
4451   return new_vect_var;
4452 }
4453
4454 /* Like vect_get_new_vect_var but return an SSA name.  */
4455
4456 tree
4457 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4458 {
4459   const char *prefix;
4460   tree new_vect_var;
4461
4462   switch (var_kind)
4463   {
4464   case vect_simple_var:
4465     prefix = "vect";
4466     break;
4467   case vect_scalar_var:
4468     prefix = "stmp";
4469     break;
4470   case vect_pointer_var:
4471     prefix = "vectp";
4472     break;
4473   default:
4474     gcc_unreachable ();
4475   }
4476
4477   if (name)
4478     {
4479       char* tmp = concat (prefix, "_", name, NULL);
4480       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4481       free (tmp);
4482     }
4483   else
4484     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4485
4486   return new_vect_var;
4487 }
4488
4489 /* Duplicate ptr info and set alignment/misaligment on NAME from DR.  */
4490
4491 static void
4492 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4493 {
4494   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4495   int misalign = DR_MISALIGNMENT (dr);
4496   if (misalign == DR_MISALIGNMENT_UNKNOWN)
4497     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4498   else
4499     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4500                             DR_TARGET_ALIGNMENT (dr), misalign);
4501 }
4502
4503 /* Function vect_create_addr_base_for_vector_ref.
4504
4505    Create an expression that computes the address of the first memory location
4506    that will be accessed for a data reference.
4507
4508    Input:
4509    STMT: The statement containing the data reference.
4510    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4511    OFFSET: Optional. If supplied, it is be added to the initial address.
4512    LOOP:    Specify relative to which loop-nest should the address be computed.
4513             For example, when the dataref is in an inner-loop nested in an
4514             outer-loop that is now being vectorized, LOOP can be either the
4515             outer-loop, or the inner-loop.  The first memory location accessed
4516             by the following dataref ('in' points to short):
4517
4518                 for (i=0; i<N; i++)
4519                    for (j=0; j<M; j++)
4520                      s += in[i+j]
4521
4522             is as follows:
4523             if LOOP=i_loop:     &in             (relative to i_loop)
4524             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
4525    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4526             initial address.  Unlike OFFSET, which is number of elements to
4527             be added, BYTE_OFFSET is measured in bytes.
4528
4529    Output:
4530    1. Return an SSA_NAME whose value is the address of the memory location of
4531       the first vector of the data reference.
4532    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4533       these statement(s) which define the returned SSA_NAME.
4534
4535    FORNOW: We are only handling array accesses with step 1.  */
4536
4537 tree
4538 vect_create_addr_base_for_vector_ref (gimple *stmt,
4539                                       gimple_seq *new_stmt_list,
4540                                       tree offset,
4541                                       tree byte_offset)
4542 {
4543   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4544   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4545   const char *base_name;
4546   tree addr_base;
4547   tree dest;
4548   gimple_seq seq = NULL;
4549   tree vect_ptr_type;
4550   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4551   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4552   innermost_loop_behavior *drb = vect_dr_behavior (dr);
4553
4554   tree data_ref_base = unshare_expr (drb->base_address);
4555   tree base_offset = unshare_expr (drb->offset);
4556   tree init = unshare_expr (drb->init);
4557
4558   if (loop_vinfo)
4559     base_name = get_name (data_ref_base);
4560   else
4561     {
4562       base_offset = ssize_int (0);
4563       init = ssize_int (0);
4564       base_name = get_name (DR_REF (dr));
4565     }
4566
4567   /* Create base_offset */
4568   base_offset = size_binop (PLUS_EXPR,
4569                             fold_convert (sizetype, base_offset),
4570                             fold_convert (sizetype, init));
4571
4572   if (offset)
4573     {
4574       offset = fold_build2 (MULT_EXPR, sizetype,
4575                             fold_convert (sizetype, offset), step);
4576       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4577                                  base_offset, offset);
4578     }
4579   if (byte_offset)
4580     {
4581       byte_offset = fold_convert (sizetype, byte_offset);
4582       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4583                                  base_offset, byte_offset);
4584     }
4585
4586   /* base + base_offset */
4587   if (loop_vinfo)
4588     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4589   else
4590     {
4591       addr_base = build1 (ADDR_EXPR,
4592                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
4593                           unshare_expr (DR_REF (dr)));
4594     }
4595
4596   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4597   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4598   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4599   gimple_seq_add_seq (new_stmt_list, seq);
4600
4601   if (DR_PTR_INFO (dr)
4602       && TREE_CODE (addr_base) == SSA_NAME
4603       && !SSA_NAME_PTR_INFO (addr_base))
4604     {
4605       vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4606       if (offset || byte_offset)
4607         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4608     }
4609
4610   if (dump_enabled_p ())
4611     {
4612       dump_printf_loc (MSG_NOTE, vect_location, "created ");
4613       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4614       dump_printf (MSG_NOTE, "\n");
4615     }
4616
4617   return addr_base;
4618 }
4619
4620
4621 /* Function vect_create_data_ref_ptr.
4622
4623    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4624    location accessed in the loop by STMT, along with the def-use update
4625    chain to appropriately advance the pointer through the loop iterations.
4626    Also set aliasing information for the pointer.  This pointer is used by
4627    the callers to this function to create a memory reference expression for
4628    vector load/store access.
4629
4630    Input:
4631    1. STMT: a stmt that references memory. Expected to be of the form
4632          GIMPLE_ASSIGN <name, data-ref> or
4633          GIMPLE_ASSIGN <data-ref, name>.
4634    2. AGGR_TYPE: the type of the reference, which should be either a vector
4635         or an array.
4636    3. AT_LOOP: the loop where the vector memref is to be created.
4637    4. OFFSET (optional): an offset to be added to the initial address accessed
4638         by the data-ref in STMT.
4639    5. BSI: location where the new stmts are to be placed if there is no loop
4640    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4641         pointing to the initial address.
4642    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4643         to the initial address accessed by the data-ref in STMT.  This is
4644         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4645         in bytes.
4646    8. IV_STEP (optional, defaults to NULL): the amount that should be added
4647         to the IV during each iteration of the loop.  NULL says to move
4648         by one copy of AGGR_TYPE up or down, depending on the step of the
4649         data reference.
4650
4651    Output:
4652    1. Declare a new ptr to vector_type, and have it point to the base of the
4653       data reference (initial addressed accessed by the data reference).
4654       For example, for vector of type V8HI, the following code is generated:
4655
4656       v8hi *ap;
4657       ap = (v8hi *)initial_address;
4658
4659       if OFFSET is not supplied:
4660          initial_address = &a[init];
4661       if OFFSET is supplied:
4662          initial_address = &a[init + OFFSET];
4663       if BYTE_OFFSET is supplied:
4664          initial_address = &a[init] + BYTE_OFFSET;
4665
4666       Return the initial_address in INITIAL_ADDRESS.
4667
4668    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4669       update the pointer in each iteration of the loop.
4670
4671       Return the increment stmt that updates the pointer in PTR_INCR.
4672
4673    3. Set INV_P to true if the access pattern of the data reference in the
4674       vectorized loop is invariant.  Set it to false otherwise.
4675
4676    4. Return the pointer.  */
4677
4678 tree
4679 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4680                           tree offset, tree *initial_address,
4681                           gimple_stmt_iterator *gsi, gimple **ptr_incr,
4682                           bool only_init, bool *inv_p, tree byte_offset,
4683                           tree iv_step)
4684 {
4685   const char *base_name;
4686   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4687   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4688   struct loop *loop = NULL;
4689   bool nested_in_vect_loop = false;
4690   struct loop *containing_loop = NULL;
4691   tree aggr_ptr_type;
4692   tree aggr_ptr;
4693   tree new_temp;
4694   gimple_seq new_stmt_list = NULL;
4695   edge pe = NULL;
4696   basic_block new_bb;
4697   tree aggr_ptr_init;
4698   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4699   tree aptr;
4700   gimple_stmt_iterator incr_gsi;
4701   bool insert_after;
4702   tree indx_before_incr, indx_after_incr;
4703   gimple *incr;
4704   tree step;
4705   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4706
4707   gcc_assert (iv_step != NULL_TREE
4708               || TREE_CODE (aggr_type) == ARRAY_TYPE
4709               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4710
4711   if (loop_vinfo)
4712     {
4713       loop = LOOP_VINFO_LOOP (loop_vinfo);
4714       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4715       containing_loop = (gimple_bb (stmt))->loop_father;
4716       pe = loop_preheader_edge (loop);
4717     }
4718   else
4719     {
4720       gcc_assert (bb_vinfo);
4721       only_init = true;
4722       *ptr_incr = NULL;
4723     }
4724
4725   /* Check the step (evolution) of the load in LOOP, and record
4726      whether it's invariant.  */
4727   step = vect_dr_behavior (dr)->step;
4728   if (integer_zerop (step))
4729     *inv_p = true;
4730   else
4731     *inv_p = false;
4732
4733   /* Create an expression for the first address accessed by this load
4734      in LOOP.  */
4735   base_name = get_name (DR_BASE_ADDRESS (dr));
4736
4737   if (dump_enabled_p ())
4738     {
4739       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4740       dump_printf_loc (MSG_NOTE, vect_location,
4741                        "create %s-pointer variable to type: ",
4742                        get_tree_code_name (TREE_CODE (aggr_type)));
4743       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4744       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4745         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4746       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4747         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4748       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4749         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4750       else
4751         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4752       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4753       dump_printf (MSG_NOTE, "\n");
4754     }
4755
4756   /* (1) Create the new aggregate-pointer variable.
4757      Vector and array types inherit the alias set of their component
4758      type by default so we need to use a ref-all pointer if the data
4759      reference does not conflict with the created aggregated data
4760      reference because it is not addressable.  */
4761   bool need_ref_all = false;
4762   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4763                               get_alias_set (DR_REF (dr))))
4764     need_ref_all = true;
4765   /* Likewise for any of the data references in the stmt group.  */
4766   else if (DR_GROUP_SIZE (stmt_info) > 1)
4767     {
4768       gimple *orig_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info);
4769       do
4770         {
4771           stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4772           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4773           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4774                                       get_alias_set (DR_REF (sdr))))
4775             {
4776               need_ref_all = true;
4777               break;
4778             }
4779           orig_stmt = DR_GROUP_NEXT_ELEMENT (sinfo);
4780         }
4781       while (orig_stmt);
4782     }
4783   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4784                                                need_ref_all);
4785   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4786
4787
4788   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4789      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4790      def-use update cycles for the pointer: one relative to the outer-loop
4791      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4792      to the inner-loop (which is the inner-most loop containing the dataref),
4793      and this is done be step (5) below.
4794
4795      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4796      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4797      redundant.  Steps (3),(4) create the following:
4798
4799         vp0 = &base_addr;
4800         LOOP:   vp1 = phi(vp0,vp2)
4801                 ...
4802                 ...
4803                 vp2 = vp1 + step
4804                 goto LOOP
4805
4806      If there is an inner-loop nested in loop, then step (5) will also be
4807      applied, and an additional update in the inner-loop will be created:
4808
4809         vp0 = &base_addr;
4810         LOOP:   vp1 = phi(vp0,vp2)
4811                 ...
4812         inner:     vp3 = phi(vp1,vp4)
4813                    vp4 = vp3 + inner_step
4814                    if () goto inner
4815                 ...
4816                 vp2 = vp1 + step
4817                 if () goto LOOP   */
4818
4819   /* (2) Calculate the initial address of the aggregate-pointer, and set
4820      the aggregate-pointer to point to it before the loop.  */
4821
4822   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4823
4824   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4825                                                    offset, byte_offset);
4826   if (new_stmt_list)
4827     {
4828       if (pe)
4829         {
4830           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4831           gcc_assert (!new_bb);
4832         }
4833       else
4834         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4835     }
4836
4837   *initial_address = new_temp;
4838   aggr_ptr_init = new_temp;
4839
4840   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4841      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4842      inner-loop nested in LOOP (during outer-loop vectorization).  */
4843
4844   /* No update in loop is required.  */
4845   if (only_init && (!loop_vinfo || at_loop == loop))
4846     aptr = aggr_ptr_init;
4847   else
4848     {
4849       if (iv_step == NULL_TREE)
4850         {
4851           /* The step of the aggregate pointer is the type size.  */
4852           iv_step = TYPE_SIZE_UNIT (aggr_type);
4853           /* One exception to the above is when the scalar step of the load in
4854              LOOP is zero. In this case the step here is also zero.  */
4855           if (*inv_p)
4856             iv_step = size_zero_node;
4857           else if (tree_int_cst_sgn (step) == -1)
4858             iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4859         }
4860
4861       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4862
4863       create_iv (aggr_ptr_init,
4864                  fold_convert (aggr_ptr_type, iv_step),
4865                  aggr_ptr, loop, &incr_gsi, insert_after,
4866                  &indx_before_incr, &indx_after_incr);
4867       incr = gsi_stmt (incr_gsi);
4868       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4869
4870       /* Copy the points-to information if it exists. */
4871       if (DR_PTR_INFO (dr))
4872         {
4873           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4874           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4875         }
4876       if (ptr_incr)
4877         *ptr_incr = incr;
4878
4879       aptr = indx_before_incr;
4880     }
4881
4882   if (!nested_in_vect_loop || only_init)
4883     return aptr;
4884
4885
4886   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4887      nested in LOOP, if exists.  */
4888
4889   gcc_assert (nested_in_vect_loop);
4890   if (!only_init)
4891     {
4892       standard_iv_increment_position (containing_loop, &incr_gsi,
4893                                       &insert_after);
4894       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4895                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4896                  &indx_after_incr);
4897       incr = gsi_stmt (incr_gsi);
4898       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4899
4900       /* Copy the points-to information if it exists. */
4901       if (DR_PTR_INFO (dr))
4902         {
4903           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4904           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4905         }
4906       if (ptr_incr)
4907         *ptr_incr = incr;
4908
4909       return indx_before_incr;
4910     }
4911   else
4912     gcc_unreachable ();
4913 }
4914
4915
4916 /* Function bump_vector_ptr
4917
4918    Increment a pointer (to a vector type) by vector-size. If requested,
4919    i.e. if PTR-INCR is given, then also connect the new increment stmt
4920    to the existing def-use update-chain of the pointer, by modifying
4921    the PTR_INCR as illustrated below:
4922
4923    The pointer def-use update-chain before this function:
4924                         DATAREF_PTR = phi (p_0, p_2)
4925                         ....
4926         PTR_INCR:       p_2 = DATAREF_PTR + step
4927
4928    The pointer def-use update-chain after this function:
4929                         DATAREF_PTR = phi (p_0, p_2)
4930                         ....
4931                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4932                         ....
4933         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4934
4935    Input:
4936    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4937                  in the loop.
4938    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4939               the loop.  The increment amount across iterations is expected
4940               to be vector_size.
4941    BSI - location where the new update stmt is to be placed.
4942    STMT - the original scalar memory-access stmt that is being vectorized.
4943    BUMP - optional. The offset by which to bump the pointer. If not given,
4944           the offset is assumed to be vector_size.
4945
4946    Output: Return NEW_DATAREF_PTR as illustrated above.
4947
4948 */
4949
4950 tree
4951 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4952                  gimple *stmt, tree bump)
4953 {
4954   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4955   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4956   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4957   tree update = TYPE_SIZE_UNIT (vectype);
4958   gassign *incr_stmt;
4959   ssa_op_iter iter;
4960   use_operand_p use_p;
4961   tree new_dataref_ptr;
4962
4963   if (bump)
4964     update = bump;
4965
4966   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4967     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4968   else
4969     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4970   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4971                                    dataref_ptr, update);
4972   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4973
4974   /* Copy the points-to information if it exists. */
4975   if (DR_PTR_INFO (dr))
4976     {
4977       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4978       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4979     }
4980
4981   if (!ptr_incr)
4982     return new_dataref_ptr;
4983
4984   /* Update the vector-pointer's cross-iteration increment.  */
4985   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4986     {
4987       tree use = USE_FROM_PTR (use_p);
4988
4989       if (use == dataref_ptr)
4990         SET_USE (use_p, new_dataref_ptr);
4991       else
4992         gcc_assert (operand_equal_p (use, update, 0));
4993     }
4994
4995   return new_dataref_ptr;
4996 }
4997
4998
4999 /* Copy memory reference info such as base/clique from the SRC reference
5000    to the DEST MEM_REF.  */
5001
5002 void
5003 vect_copy_ref_info (tree dest, tree src)
5004 {
5005   if (TREE_CODE (dest) != MEM_REF)
5006     return;
5007
5008   tree src_base = src;
5009   while (handled_component_p (src_base))
5010     src_base = TREE_OPERAND (src_base, 0);
5011   if (TREE_CODE (src_base) != MEM_REF
5012       && TREE_CODE (src_base) != TARGET_MEM_REF)
5013     return;
5014
5015   MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5016   MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5017 }
5018
5019
5020 /* Function vect_create_destination_var.
5021
5022    Create a new temporary of type VECTYPE.  */
5023
5024 tree
5025 vect_create_destination_var (tree scalar_dest, tree vectype)
5026 {
5027   tree vec_dest;
5028   const char *name;
5029   char *new_name;
5030   tree type;
5031   enum vect_var_kind kind;
5032
5033   kind = vectype
5034     ? VECTOR_BOOLEAN_TYPE_P (vectype)
5035     ? vect_mask_var
5036     : vect_simple_var
5037     : vect_scalar_var;
5038   type = vectype ? vectype : TREE_TYPE (scalar_dest);
5039
5040   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5041
5042   name = get_name (scalar_dest);
5043   if (name)
5044     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5045   else
5046     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5047   vec_dest = vect_get_new_vect_var (type, kind, new_name);
5048   free (new_name);
5049
5050   return vec_dest;
5051 }
5052
5053 /* Function vect_grouped_store_supported.
5054
5055    Returns TRUE if interleave high and interleave low permutations
5056    are supported, and FALSE otherwise.  */
5057
5058 bool
5059 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5060 {
5061   machine_mode mode = TYPE_MODE (vectype);
5062
5063   /* vect_permute_store_chain requires the group size to be equal to 3 or
5064      be a power of two.  */
5065   if (count != 3 && exact_log2 (count) == -1)
5066     {
5067       if (dump_enabled_p ())
5068         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5069                          "the size of the group of accesses"
5070                          " is not a power of 2 or not eqaul to 3\n");
5071       return false;
5072     }
5073
5074   /* Check that the permutation is supported.  */
5075   if (VECTOR_MODE_P (mode))
5076     {
5077       unsigned int i;
5078       if (count == 3)
5079         {
5080           unsigned int j0 = 0, j1 = 0, j2 = 0;
5081           unsigned int i, j;
5082
5083           unsigned int nelt;
5084           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5085             {
5086               if (dump_enabled_p ())
5087                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5088                                  "cannot handle groups of 3 stores for"
5089                                  " variable-length vectors\n");
5090               return false;
5091             }
5092
5093           vec_perm_builder sel (nelt, nelt, 1);
5094           sel.quick_grow (nelt);
5095           vec_perm_indices indices;
5096           for (j = 0; j < 3; j++)
5097             {
5098               int nelt0 = ((3 - j) * nelt) % 3;
5099               int nelt1 = ((3 - j) * nelt + 1) % 3;
5100               int nelt2 = ((3 - j) * nelt + 2) % 3;
5101               for (i = 0; i < nelt; i++)
5102                 {
5103                   if (3 * i + nelt0 < nelt)
5104                     sel[3 * i + nelt0] = j0++;
5105                   if (3 * i + nelt1 < nelt)
5106                     sel[3 * i + nelt1] = nelt + j1++;
5107                   if (3 * i + nelt2 < nelt)
5108                     sel[3 * i + nelt2] = 0;
5109                 }
5110               indices.new_vector (sel, 2, nelt);
5111               if (!can_vec_perm_const_p (mode, indices))
5112                 {
5113                   if (dump_enabled_p ())
5114                     dump_printf (MSG_MISSED_OPTIMIZATION,
5115                                  "permutation op not supported by target.\n");
5116                   return false;
5117                 }
5118
5119               for (i = 0; i < nelt; i++)
5120                 {
5121                   if (3 * i + nelt0 < nelt)
5122                     sel[3 * i + nelt0] = 3 * i + nelt0;
5123                   if (3 * i + nelt1 < nelt)
5124                     sel[3 * i + nelt1] = 3 * i + nelt1;
5125                   if (3 * i + nelt2 < nelt)
5126                     sel[3 * i + nelt2] = nelt + j2++;
5127                 }
5128               indices.new_vector (sel, 2, nelt);
5129               if (!can_vec_perm_const_p (mode, indices))
5130                 {
5131                   if (dump_enabled_p ())
5132                     dump_printf (MSG_MISSED_OPTIMIZATION,
5133                                  "permutation op not supported by target.\n");
5134                   return false;
5135                 }
5136             }
5137           return true;
5138         }
5139       else
5140         {
5141           /* If length is not equal to 3 then only power of 2 is supported.  */
5142           gcc_assert (pow2p_hwi (count));
5143           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5144
5145           /* The encoding has 2 interleaved stepped patterns.  */
5146           vec_perm_builder sel (nelt, 2, 3);
5147           sel.quick_grow (6);
5148           for (i = 0; i < 3; i++)
5149             {
5150               sel[i * 2] = i;
5151               sel[i * 2 + 1] = i + nelt;
5152             }
5153           vec_perm_indices indices (sel, 2, nelt);
5154           if (can_vec_perm_const_p (mode, indices))
5155             {
5156               for (i = 0; i < 6; i++)
5157                 sel[i] += exact_div (nelt, 2);
5158               indices.new_vector (sel, 2, nelt);
5159               if (can_vec_perm_const_p (mode, indices))
5160                 return true;
5161             }
5162         }
5163     }
5164
5165   if (dump_enabled_p ())
5166     dump_printf (MSG_MISSED_OPTIMIZATION,
5167                  "permutaion op not supported by target.\n");
5168   return false;
5169 }
5170
5171
5172 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5173    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5174
5175 bool
5176 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5177                             bool masked_p)
5178 {
5179   if (masked_p)
5180     return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5181                                          vec_mask_store_lanes_optab,
5182                                          vectype, count);
5183   else
5184     return vect_lanes_optab_supported_p ("vec_store_lanes",
5185                                          vec_store_lanes_optab,
5186                                          vectype, count);
5187 }
5188
5189
5190 /* Function vect_permute_store_chain.
5191
5192    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5193    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5194    the data correctly for the stores.  Return the final references for stores
5195    in RESULT_CHAIN.
5196
5197    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5198    The input is 4 vectors each containing 8 elements.  We assign a number to
5199    each element, the input sequence is:
5200
5201    1st vec:   0  1  2  3  4  5  6  7
5202    2nd vec:   8  9 10 11 12 13 14 15
5203    3rd vec:  16 17 18 19 20 21 22 23
5204    4th vec:  24 25 26 27 28 29 30 31
5205
5206    The output sequence should be:
5207
5208    1st vec:  0  8 16 24  1  9 17 25
5209    2nd vec:  2 10 18 26  3 11 19 27
5210    3rd vec:  4 12 20 28  5 13 21 30
5211    4th vec:  6 14 22 30  7 15 23 31
5212
5213    i.e., we interleave the contents of the four vectors in their order.
5214
5215    We use interleave_high/low instructions to create such output.  The input of
5216    each interleave_high/low operation is two vectors:
5217    1st vec    2nd vec
5218    0 1 2 3    4 5 6 7
5219    the even elements of the result vector are obtained left-to-right from the
5220    high/low elements of the first vector.  The odd elements of the result are
5221    obtained left-to-right from the high/low elements of the second vector.
5222    The output of interleave_high will be:   0 4 1 5
5223    and of interleave_low:                   2 6 3 7
5224
5225
5226    The permutation is done in log LENGTH stages.  In each stage interleave_high
5227    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5228    where the first argument is taken from the first half of DR_CHAIN and the
5229    second argument from it's second half.
5230    In our example,
5231
5232    I1: interleave_high (1st vec, 3rd vec)
5233    I2: interleave_low (1st vec, 3rd vec)
5234    I3: interleave_high (2nd vec, 4th vec)
5235    I4: interleave_low (2nd vec, 4th vec)
5236
5237    The output for the first stage is:
5238
5239    I1:  0 16  1 17  2 18  3 19
5240    I2:  4 20  5 21  6 22  7 23
5241    I3:  8 24  9 25 10 26 11 27
5242    I4: 12 28 13 29 14 30 15 31
5243
5244    The output of the second stage, i.e. the final result is:
5245
5246    I1:  0  8 16 24  1  9 17 25
5247    I2:  2 10 18 26  3 11 19 27
5248    I3:  4 12 20 28  5 13 21 30
5249    I4:  6 14 22 30  7 15 23 31.  */
5250
5251 void
5252 vect_permute_store_chain (vec<tree> dr_chain,
5253                           unsigned int length,
5254                           gimple *stmt,
5255                           gimple_stmt_iterator *gsi,
5256                           vec<tree> *result_chain)
5257 {
5258   tree vect1, vect2, high, low;
5259   gimple *perm_stmt;
5260   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5261   tree perm_mask_low, perm_mask_high;
5262   tree data_ref;
5263   tree perm3_mask_low, perm3_mask_high;
5264   unsigned int i, j, n, log_length = exact_log2 (length);
5265
5266   result_chain->quick_grow (length);
5267   memcpy (result_chain->address (), dr_chain.address (),
5268           length * sizeof (tree));
5269
5270   if (length == 3)
5271     {
5272       /* vect_grouped_store_supported ensures that this is constant.  */
5273       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5274       unsigned int j0 = 0, j1 = 0, j2 = 0;
5275
5276       vec_perm_builder sel (nelt, nelt, 1);
5277       sel.quick_grow (nelt);
5278       vec_perm_indices indices;
5279       for (j = 0; j < 3; j++)
5280         {
5281           int nelt0 = ((3 - j) * nelt) % 3;
5282           int nelt1 = ((3 - j) * nelt + 1) % 3;
5283           int nelt2 = ((3 - j) * nelt + 2) % 3;
5284
5285           for (i = 0; i < nelt; i++)
5286             {
5287               if (3 * i + nelt0 < nelt)
5288                 sel[3 * i + nelt0] = j0++;
5289               if (3 * i + nelt1 < nelt)
5290                 sel[3 * i + nelt1] = nelt + j1++;
5291               if (3 * i + nelt2 < nelt)
5292                 sel[3 * i + nelt2] = 0;
5293             }
5294           indices.new_vector (sel, 2, nelt);
5295           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5296
5297           for (i = 0; i < nelt; i++)
5298             {
5299               if (3 * i + nelt0 < nelt)
5300                 sel[3 * i + nelt0] = 3 * i + nelt0;
5301               if (3 * i + nelt1 < nelt)
5302                 sel[3 * i + nelt1] = 3 * i + nelt1;
5303               if (3 * i + nelt2 < nelt)
5304                 sel[3 * i + nelt2] = nelt + j2++;
5305             }
5306           indices.new_vector (sel, 2, nelt);
5307           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5308
5309           vect1 = dr_chain[0];
5310           vect2 = dr_chain[1];
5311
5312           /* Create interleaving stmt:
5313              low = VEC_PERM_EXPR <vect1, vect2,
5314                                   {j, nelt, *, j + 1, nelt + j + 1, *,
5315                                    j + 2, nelt + j + 2, *, ...}>  */
5316           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5317           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5318                                            vect2, perm3_mask_low);
5319           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5320
5321           vect1 = data_ref;
5322           vect2 = dr_chain[2];
5323           /* Create interleaving stmt:
5324              low = VEC_PERM_EXPR <vect1, vect2,
5325                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
5326                                    6, 7, nelt + j + 2, ...}>  */
5327           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5328           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5329                                            vect2, perm3_mask_high);
5330           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5331           (*result_chain)[j] = data_ref;
5332         }
5333     }
5334   else
5335     {
5336       /* If length is not equal to 3 then only power of 2 is supported.  */
5337       gcc_assert (pow2p_hwi (length));
5338
5339       /* The encoding has 2 interleaved stepped patterns.  */
5340       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5341       vec_perm_builder sel (nelt, 2, 3);
5342       sel.quick_grow (6);
5343       for (i = 0; i < 3; i++)
5344         {
5345           sel[i * 2] = i;
5346           sel[i * 2 + 1] = i + nelt;
5347         }
5348         vec_perm_indices indices (sel, 2, nelt);
5349         perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5350
5351         for (i = 0; i < 6; i++)
5352           sel[i] += exact_div (nelt, 2);
5353         indices.new_vector (sel, 2, nelt);
5354         perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5355
5356         for (i = 0, n = log_length; i < n; i++)
5357           {
5358             for (j = 0; j < length/2; j++)
5359               {
5360                 vect1 = dr_chain[j];
5361                 vect2 = dr_chain[j+length/2];
5362
5363                 /* Create interleaving stmt:
5364                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5365                                                         ...}>  */
5366                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5367                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5368                                                  vect2, perm_mask_high);
5369                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5370                 (*result_chain)[2*j] = high;
5371
5372                 /* Create interleaving stmt:
5373                    low = VEC_PERM_EXPR <vect1, vect2,
5374                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5375                                          ...}>  */
5376                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5377                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5378                                                  vect2, perm_mask_low);
5379                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5380                 (*result_chain)[2*j+1] = low;
5381               }
5382             memcpy (dr_chain.address (), result_chain->address (),
5383                     length * sizeof (tree));
5384           }
5385     }
5386 }
5387
5388 /* Function vect_setup_realignment
5389
5390    This function is called when vectorizing an unaligned load using
5391    the dr_explicit_realign[_optimized] scheme.
5392    This function generates the following code at the loop prolog:
5393
5394       p = initial_addr;
5395    x  msq_init = *(floor(p));   # prolog load
5396       realignment_token = call target_builtin;
5397     loop:
5398    x  msq = phi (msq_init, ---)
5399
5400    The stmts marked with x are generated only for the case of
5401    dr_explicit_realign_optimized.
5402
5403    The code above sets up a new (vector) pointer, pointing to the first
5404    location accessed by STMT, and a "floor-aligned" load using that pointer.
5405    It also generates code to compute the "realignment-token" (if the relevant
5406    target hook was defined), and creates a phi-node at the loop-header bb
5407    whose arguments are the result of the prolog-load (created by this
5408    function) and the result of a load that takes place in the loop (to be
5409    created by the caller to this function).
5410
5411    For the case of dr_explicit_realign_optimized:
5412    The caller to this function uses the phi-result (msq) to create the
5413    realignment code inside the loop, and sets up the missing phi argument,
5414    as follows:
5415     loop:
5416       msq = phi (msq_init, lsq)
5417       lsq = *(floor(p'));        # load in loop
5418       result = realign_load (msq, lsq, realignment_token);
5419
5420    For the case of dr_explicit_realign:
5421     loop:
5422       msq = *(floor(p));        # load in loop
5423       p' = p + (VS-1);
5424       lsq = *(floor(p'));       # load in loop
5425       result = realign_load (msq, lsq, realignment_token);
5426
5427    Input:
5428    STMT - (scalar) load stmt to be vectorized. This load accesses
5429           a memory location that may be unaligned.
5430    BSI - place where new code is to be inserted.
5431    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5432                               is used.
5433
5434    Output:
5435    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5436                        target hook, if defined.
5437    Return value - the result of the loop-header phi node.  */
5438
5439 tree
5440 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5441                         tree *realignment_token,
5442                         enum dr_alignment_support alignment_support_scheme,
5443                         tree init_addr,
5444                         struct loop **at_loop)
5445 {
5446   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5447   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5448   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5449   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5450   struct loop *loop = NULL;
5451   edge pe = NULL;
5452   tree scalar_dest = gimple_assign_lhs (stmt);
5453   tree vec_dest;
5454   gimple *inc;
5455   tree ptr;
5456   tree data_ref;
5457   basic_block new_bb;
5458   tree msq_init = NULL_TREE;
5459   tree new_temp;
5460   gphi *phi_stmt;
5461   tree msq = NULL_TREE;
5462   gimple_seq stmts = NULL;
5463   bool inv_p;
5464   bool compute_in_loop = false;
5465   bool nested_in_vect_loop = false;
5466   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5467   struct loop *loop_for_initial_load = NULL;
5468
5469   if (loop_vinfo)
5470     {
5471       loop = LOOP_VINFO_LOOP (loop_vinfo);
5472       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5473     }
5474
5475   gcc_assert (alignment_support_scheme == dr_explicit_realign
5476               || alignment_support_scheme == dr_explicit_realign_optimized);
5477
5478   /* We need to generate three things:
5479      1. the misalignment computation
5480      2. the extra vector load (for the optimized realignment scheme).
5481      3. the phi node for the two vectors from which the realignment is
5482       done (for the optimized realignment scheme).  */
5483
5484   /* 1. Determine where to generate the misalignment computation.
5485
5486      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5487      calculation will be generated by this function, outside the loop (in the
5488      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5489      caller, inside the loop.
5490
5491      Background: If the misalignment remains fixed throughout the iterations of
5492      the loop, then both realignment schemes are applicable, and also the
5493      misalignment computation can be done outside LOOP.  This is because we are
5494      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5495      are a multiple of VS (the Vector Size), and therefore the misalignment in
5496      different vectorized LOOP iterations is always the same.
5497      The problem arises only if the memory access is in an inner-loop nested
5498      inside LOOP, which is now being vectorized using outer-loop vectorization.
5499      This is the only case when the misalignment of the memory access may not
5500      remain fixed throughout the iterations of the inner-loop (as explained in
5501      detail in vect_supportable_dr_alignment).  In this case, not only is the
5502      optimized realignment scheme not applicable, but also the misalignment
5503      computation (and generation of the realignment token that is passed to
5504      REALIGN_LOAD) have to be done inside the loop.
5505
5506      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5507      or not, which in turn determines if the misalignment is computed inside
5508      the inner-loop, or outside LOOP.  */
5509
5510   if (init_addr != NULL_TREE || !loop_vinfo)
5511     {
5512       compute_in_loop = true;
5513       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5514     }
5515
5516
5517   /* 2. Determine where to generate the extra vector load.
5518
5519      For the optimized realignment scheme, instead of generating two vector
5520      loads in each iteration, we generate a single extra vector load in the
5521      preheader of the loop, and in each iteration reuse the result of the
5522      vector load from the previous iteration.  In case the memory access is in
5523      an inner-loop nested inside LOOP, which is now being vectorized using
5524      outer-loop vectorization, we need to determine whether this initial vector
5525      load should be generated at the preheader of the inner-loop, or can be
5526      generated at the preheader of LOOP.  If the memory access has no evolution
5527      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5528      to be generated inside LOOP (in the preheader of the inner-loop).  */
5529
5530   if (nested_in_vect_loop)
5531     {
5532       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5533       bool invariant_in_outerloop =
5534             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5535       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5536     }
5537   else
5538     loop_for_initial_load = loop;
5539   if (at_loop)
5540     *at_loop = loop_for_initial_load;
5541
5542   if (loop_for_initial_load)
5543     pe = loop_preheader_edge (loop_for_initial_load);
5544
5545   /* 3. For the case of the optimized realignment, create the first vector
5546       load at the loop preheader.  */
5547
5548   if (alignment_support_scheme == dr_explicit_realign_optimized)
5549     {
5550       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5551       gassign *new_stmt;
5552
5553       gcc_assert (!compute_in_loop);
5554       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5555       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5556                                       NULL_TREE, &init_addr, NULL, &inc,
5557                                       true, &inv_p);
5558       if (TREE_CODE (ptr) == SSA_NAME)
5559         new_temp = copy_ssa_name (ptr);
5560       else
5561         new_temp = make_ssa_name (TREE_TYPE (ptr));
5562       unsigned int align = DR_TARGET_ALIGNMENT (dr);
5563       new_stmt = gimple_build_assign
5564                    (new_temp, BIT_AND_EXPR, ptr,
5565                     build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5566       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5567       gcc_assert (!new_bb);
5568       data_ref
5569         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5570                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5571       vect_copy_ref_info (data_ref, DR_REF (dr));
5572       new_stmt = gimple_build_assign (vec_dest, data_ref);
5573       new_temp = make_ssa_name (vec_dest, new_stmt);
5574       gimple_assign_set_lhs (new_stmt, new_temp);
5575       if (pe)
5576         {
5577           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5578           gcc_assert (!new_bb);
5579         }
5580       else
5581          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5582
5583       msq_init = gimple_assign_lhs (new_stmt);
5584     }
5585
5586   /* 4. Create realignment token using a target builtin, if available.
5587       It is done either inside the containing loop, or before LOOP (as
5588       determined above).  */
5589
5590   if (targetm.vectorize.builtin_mask_for_load)
5591     {
5592       gcall *new_stmt;
5593       tree builtin_decl;
5594
5595       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
5596       if (!init_addr)
5597         {
5598           /* Generate the INIT_ADDR computation outside LOOP.  */
5599           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5600                                                             NULL_TREE);
5601           if (loop)
5602             {
5603               pe = loop_preheader_edge (loop);
5604               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5605               gcc_assert (!new_bb);
5606             }
5607           else
5608              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5609         }
5610
5611       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5612       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5613       vec_dest =
5614         vect_create_destination_var (scalar_dest,
5615                                      gimple_call_return_type (new_stmt));
5616       new_temp = make_ssa_name (vec_dest, new_stmt);
5617       gimple_call_set_lhs (new_stmt, new_temp);
5618
5619       if (compute_in_loop)
5620         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5621       else
5622         {
5623           /* Generate the misalignment computation outside LOOP.  */
5624           pe = loop_preheader_edge (loop);
5625           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5626           gcc_assert (!new_bb);
5627         }
5628
5629       *realignment_token = gimple_call_lhs (new_stmt);
5630
5631       /* The result of the CALL_EXPR to this builtin is determined from
5632          the value of the parameter and no global variables are touched
5633          which makes the builtin a "const" function.  Requiring the
5634          builtin to have the "const" attribute makes it unnecessary
5635          to call mark_call_clobbered.  */
5636       gcc_assert (TREE_READONLY (builtin_decl));
5637     }
5638
5639   if (alignment_support_scheme == dr_explicit_realign)
5640     return msq;
5641
5642   gcc_assert (!compute_in_loop);
5643   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5644
5645
5646   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5647
5648   pe = loop_preheader_edge (containing_loop);
5649   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5650   msq = make_ssa_name (vec_dest);
5651   phi_stmt = create_phi_node (msq, containing_loop->header);
5652   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5653
5654   return msq;
5655 }
5656
5657
5658 /* Function vect_grouped_load_supported.
5659
5660    COUNT is the size of the load group (the number of statements plus the
5661    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5662    only one statement, with a gap of COUNT - 1.
5663
5664    Returns true if a suitable permute exists.  */
5665
5666 bool
5667 vect_grouped_load_supported (tree vectype, bool single_element_p,
5668                              unsigned HOST_WIDE_INT count)
5669 {
5670   machine_mode mode = TYPE_MODE (vectype);
5671
5672   /* If this is single-element interleaving with an element distance
5673      that leaves unused vector loads around punt - we at least create
5674      very sub-optimal code in that case (and blow up memory,
5675      see PR65518).  */
5676   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5677     {
5678       if (dump_enabled_p ())
5679         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5680                          "single-element interleaving not supported "
5681                          "for not adjacent vector loads\n");
5682       return false;
5683     }
5684
5685   /* vect_permute_load_chain requires the group size to be equal to 3 or
5686      be a power of two.  */
5687   if (count != 3 && exact_log2 (count) == -1)
5688     {
5689       if (dump_enabled_p ())
5690         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5691                          "the size of the group of accesses"
5692                          " is not a power of 2 or not equal to 3\n");
5693       return false;
5694     }
5695
5696   /* Check that the permutation is supported.  */
5697   if (VECTOR_MODE_P (mode))
5698     {
5699       unsigned int i, j;
5700       if (count == 3)
5701         {
5702           unsigned int nelt;
5703           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5704             {
5705               if (dump_enabled_p ())
5706                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5707                                  "cannot handle groups of 3 loads for"
5708                                  " variable-length vectors\n");
5709               return false;
5710             }
5711
5712           vec_perm_builder sel (nelt, nelt, 1);
5713           sel.quick_grow (nelt);
5714           vec_perm_indices indices;
5715           unsigned int k;
5716           for (k = 0; k < 3; k++)
5717             {
5718               for (i = 0; i < nelt; i++)
5719                 if (3 * i + k < 2 * nelt)
5720                   sel[i] = 3 * i + k;
5721                 else
5722                   sel[i] = 0;
5723               indices.new_vector (sel, 2, nelt);
5724               if (!can_vec_perm_const_p (mode, indices))
5725                 {
5726                   if (dump_enabled_p ())
5727                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5728                                      "shuffle of 3 loads is not supported by"
5729                                      " target\n");
5730                   return false;
5731                 }
5732               for (i = 0, j = 0; i < nelt; i++)
5733                 if (3 * i + k < 2 * nelt)
5734                   sel[i] = i;
5735                 else
5736                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5737               indices.new_vector (sel, 2, nelt);
5738               if (!can_vec_perm_const_p (mode, indices))
5739                 {
5740                   if (dump_enabled_p ())
5741                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5742                                      "shuffle of 3 loads is not supported by"
5743                                      " target\n");
5744                   return false;
5745                 }
5746             }
5747           return true;
5748         }
5749       else
5750         {
5751           /* If length is not equal to 3 then only power of 2 is supported.  */
5752           gcc_assert (pow2p_hwi (count));
5753           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5754
5755           /* The encoding has a single stepped pattern.  */
5756           vec_perm_builder sel (nelt, 1, 3);
5757           sel.quick_grow (3);
5758           for (i = 0; i < 3; i++)
5759             sel[i] = i * 2;
5760           vec_perm_indices indices (sel, 2, nelt);
5761           if (can_vec_perm_const_p (mode, indices))
5762             {
5763               for (i = 0; i < 3; i++)
5764                 sel[i] = i * 2 + 1;
5765               indices.new_vector (sel, 2, nelt);
5766               if (can_vec_perm_const_p (mode, indices))
5767                 return true;
5768             }
5769         }
5770     }
5771
5772   if (dump_enabled_p ())
5773     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5774                      "extract even/odd not supported by target\n");
5775   return false;
5776 }
5777
5778 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5779    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5780
5781 bool
5782 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5783                            bool masked_p)
5784 {
5785   if (masked_p)
5786     return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5787                                          vec_mask_load_lanes_optab,
5788                                          vectype, count);
5789   else
5790     return vect_lanes_optab_supported_p ("vec_load_lanes",
5791                                          vec_load_lanes_optab,
5792                                          vectype, count);
5793 }
5794
5795 /* Function vect_permute_load_chain.
5796
5797    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5798    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5799    the input data correctly.  Return the final references for loads in
5800    RESULT_CHAIN.
5801
5802    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5803    The input is 4 vectors each containing 8 elements. We assign a number to each
5804    element, the input sequence is:
5805
5806    1st vec:   0  1  2  3  4  5  6  7
5807    2nd vec:   8  9 10 11 12 13 14 15
5808    3rd vec:  16 17 18 19 20 21 22 23
5809    4th vec:  24 25 26 27 28 29 30 31
5810
5811    The output sequence should be:
5812
5813    1st vec:  0 4  8 12 16 20 24 28
5814    2nd vec:  1 5  9 13 17 21 25 29
5815    3rd vec:  2 6 10 14 18 22 26 30
5816    4th vec:  3 7 11 15 19 23 27 31
5817
5818    i.e., the first output vector should contain the first elements of each
5819    interleaving group, etc.
5820
5821    We use extract_even/odd instructions to create such output.  The input of
5822    each extract_even/odd operation is two vectors
5823    1st vec    2nd vec
5824    0 1 2 3    4 5 6 7
5825
5826    and the output is the vector of extracted even/odd elements.  The output of
5827    extract_even will be:   0 2 4 6
5828    and of extract_odd:     1 3 5 7
5829
5830
5831    The permutation is done in log LENGTH stages.  In each stage extract_even
5832    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5833    their order.  In our example,
5834
5835    E1: extract_even (1st vec, 2nd vec)
5836    E2: extract_odd (1st vec, 2nd vec)
5837    E3: extract_even (3rd vec, 4th vec)
5838    E4: extract_odd (3rd vec, 4th vec)
5839
5840    The output for the first stage will be:
5841
5842    E1:  0  2  4  6  8 10 12 14
5843    E2:  1  3  5  7  9 11 13 15
5844    E3: 16 18 20 22 24 26 28 30
5845    E4: 17 19 21 23 25 27 29 31
5846
5847    In order to proceed and create the correct sequence for the next stage (or
5848    for the correct output, if the second stage is the last one, as in our
5849    example), we first put the output of extract_even operation and then the
5850    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5851    The input for the second stage is:
5852
5853    1st vec (E1):  0  2  4  6  8 10 12 14
5854    2nd vec (E3): 16 18 20 22 24 26 28 30
5855    3rd vec (E2):  1  3  5  7  9 11 13 15
5856    4th vec (E4): 17 19 21 23 25 27 29 31
5857
5858    The output of the second stage:
5859
5860    E1: 0 4  8 12 16 20 24 28
5861    E2: 2 6 10 14 18 22 26 30
5862    E3: 1 5  9 13 17 21 25 29
5863    E4: 3 7 11 15 19 23 27 31
5864
5865    And RESULT_CHAIN after reordering:
5866
5867    1st vec (E1):  0 4  8 12 16 20 24 28
5868    2nd vec (E3):  1 5  9 13 17 21 25 29
5869    3rd vec (E2):  2 6 10 14 18 22 26 30
5870    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5871
5872 static void
5873 vect_permute_load_chain (vec<tree> dr_chain,
5874                          unsigned int length,
5875                          gimple *stmt,
5876                          gimple_stmt_iterator *gsi,
5877                          vec<tree> *result_chain)
5878 {
5879   tree data_ref, first_vect, second_vect;
5880   tree perm_mask_even, perm_mask_odd;
5881   tree perm3_mask_low, perm3_mask_high;
5882   gimple *perm_stmt;
5883   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5884   unsigned int i, j, log_length = exact_log2 (length);
5885
5886   result_chain->quick_grow (length);
5887   memcpy (result_chain->address (), dr_chain.address (),
5888           length * sizeof (tree));
5889
5890   if (length == 3)
5891     {
5892       /* vect_grouped_load_supported ensures that this is constant.  */
5893       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5894       unsigned int k;
5895
5896       vec_perm_builder sel (nelt, nelt, 1);
5897       sel.quick_grow (nelt);
5898       vec_perm_indices indices;
5899       for (k = 0; k < 3; k++)
5900         {
5901           for (i = 0; i < nelt; i++)
5902             if (3 * i + k < 2 * nelt)
5903               sel[i] = 3 * i + k;
5904             else
5905               sel[i] = 0;
5906           indices.new_vector (sel, 2, nelt);
5907           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5908
5909           for (i = 0, j = 0; i < nelt; i++)
5910             if (3 * i + k < 2 * nelt)
5911               sel[i] = i;
5912             else
5913               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5914           indices.new_vector (sel, 2, nelt);
5915           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5916
5917           first_vect = dr_chain[0];
5918           second_vect = dr_chain[1];
5919
5920           /* Create interleaving stmt (low part of):
5921              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5922                                                              ...}>  */
5923           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5924           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5925                                            second_vect, perm3_mask_low);
5926           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5927
5928           /* Create interleaving stmt (high part of):
5929              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5930                                                               ...}>  */
5931           first_vect = data_ref;
5932           second_vect = dr_chain[2];
5933           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5934           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5935                                            second_vect, perm3_mask_high);
5936           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5937           (*result_chain)[k] = data_ref;
5938         }
5939     }
5940   else
5941     {
5942       /* If length is not equal to 3 then only power of 2 is supported.  */
5943       gcc_assert (pow2p_hwi (length));
5944
5945       /* The encoding has a single stepped pattern.  */
5946       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5947       vec_perm_builder sel (nelt, 1, 3);
5948       sel.quick_grow (3);
5949       for (i = 0; i < 3; ++i)
5950         sel[i] = i * 2;
5951       vec_perm_indices indices (sel, 2, nelt);
5952       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5953
5954       for (i = 0; i < 3; ++i)
5955         sel[i] = i * 2 + 1;
5956       indices.new_vector (sel, 2, nelt);
5957       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5958
5959       for (i = 0; i < log_length; i++)
5960         {
5961           for (j = 0; j < length; j += 2)
5962             {
5963               first_vect = dr_chain[j];
5964               second_vect = dr_chain[j+1];
5965
5966               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5967               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5968               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5969                                                first_vect, second_vect,
5970                                                perm_mask_even);
5971               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5972               (*result_chain)[j/2] = data_ref;
5973
5974               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5975               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5976               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5977                                                first_vect, second_vect,
5978                                                perm_mask_odd);
5979               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5980               (*result_chain)[j/2+length/2] = data_ref;
5981             }
5982           memcpy (dr_chain.address (), result_chain->address (),
5983                   length * sizeof (tree));
5984         }
5985     }
5986 }
5987
5988 /* Function vect_shift_permute_load_chain.
5989
5990    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5991    sequence of stmts to reorder the input data accordingly.
5992    Return the final references for loads in RESULT_CHAIN.
5993    Return true if successed, false otherwise.
5994
5995    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5996    The input is 3 vectors each containing 8 elements.  We assign a
5997    number to each element, the input sequence is:
5998
5999    1st vec:   0  1  2  3  4  5  6  7
6000    2nd vec:   8  9 10 11 12 13 14 15
6001    3rd vec:  16 17 18 19 20 21 22 23
6002
6003    The output sequence should be:
6004
6005    1st vec:  0 3 6  9 12 15 18 21
6006    2nd vec:  1 4 7 10 13 16 19 22
6007    3rd vec:  2 5 8 11 14 17 20 23
6008
6009    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6010
6011    First we shuffle all 3 vectors to get correct elements order:
6012
6013    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
6014    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
6015    3rd vec:  (16 19 22) (17 20 23) (18 21)
6016
6017    Next we unite and shift vector 3 times:
6018
6019    1st step:
6020      shift right by 6 the concatenation of:
6021      "1st vec" and  "2nd vec"
6022        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6023      "2nd vec" and  "3rd vec"
6024        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6025      "3rd vec" and  "1st vec"
6026        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6027                              | New vectors                   |
6028
6029      So that now new vectors are:
6030
6031      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6032      2nd vec:  (10 13) (16 19 22) (17 20 23)
6033      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6034
6035    2nd step:
6036      shift right by 5 the concatenation of:
6037      "1st vec" and  "3rd vec"
6038        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6039      "2nd vec" and  "1st vec"
6040        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6041      "3rd vec" and  "2nd vec"
6042        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6043                           | New vectors                   |
6044
6045      So that now new vectors are:
6046
6047      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6048      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6049      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6050
6051    3rd step:
6052      shift right by 5 the concatenation of:
6053      "1st vec" and  "1st vec"
6054        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6055      shift right by 3 the concatenation of:
6056      "2nd vec" and  "2nd vec"
6057                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6058                           | New vectors                   |
6059
6060      So that now all vectors are READY:
6061      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6062      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6063      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6064
6065    This algorithm is faster than one in vect_permute_load_chain if:
6066      1.  "shift of a concatination" is faster than general permutation.
6067          This is usually so.
6068      2.  The TARGET machine can't execute vector instructions in parallel.
6069          This is because each step of the algorithm depends on previous.
6070          The algorithm in vect_permute_load_chain is much more parallel.
6071
6072    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6073 */
6074
6075 static bool
6076 vect_shift_permute_load_chain (vec<tree> dr_chain,
6077                                unsigned int length,
6078                                gimple *stmt,
6079                                gimple_stmt_iterator *gsi,
6080                                vec<tree> *result_chain)
6081 {
6082   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6083   tree perm2_mask1, perm2_mask2, perm3_mask;
6084   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6085   gimple *perm_stmt;
6086
6087   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6088   unsigned int i;
6089   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6090   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6091
6092   unsigned HOST_WIDE_INT nelt, vf;
6093   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6094       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6095     /* Not supported for variable-length vectors.  */
6096     return false;
6097
6098   vec_perm_builder sel (nelt, nelt, 1);
6099   sel.quick_grow (nelt);
6100
6101   result_chain->quick_grow (length);
6102   memcpy (result_chain->address (), dr_chain.address (),
6103           length * sizeof (tree));
6104
6105   if (pow2p_hwi (length) && vf > 4)
6106     {
6107       unsigned int j, log_length = exact_log2 (length);
6108       for (i = 0; i < nelt / 2; ++i)
6109         sel[i] = i * 2;
6110       for (i = 0; i < nelt / 2; ++i)
6111         sel[nelt / 2 + i] = i * 2 + 1;
6112       vec_perm_indices indices (sel, 2, nelt);
6113       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6114         {
6115           if (dump_enabled_p ())
6116             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6117                              "shuffle of 2 fields structure is not \
6118                               supported by target\n");
6119           return false;
6120         }
6121       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6122
6123       for (i = 0; i < nelt / 2; ++i)
6124         sel[i] = i * 2 + 1;
6125       for (i = 0; i < nelt / 2; ++i)
6126         sel[nelt / 2 + i] = i * 2;
6127       indices.new_vector (sel, 2, nelt);
6128       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6129         {
6130           if (dump_enabled_p ())
6131             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6132                              "shuffle of 2 fields structure is not \
6133                               supported by target\n");
6134           return false;
6135         }
6136       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6137
6138       /* Generating permutation constant to shift all elements.
6139          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6140       for (i = 0; i < nelt; i++)
6141         sel[i] = nelt / 2 + i;
6142       indices.new_vector (sel, 2, nelt);
6143       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6144         {
6145           if (dump_enabled_p ())
6146             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6147                              "shift permutation is not supported by target\n");
6148           return false;
6149         }
6150       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6151
6152       /* Generating permutation constant to select vector from 2.
6153          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6154       for (i = 0; i < nelt / 2; i++)
6155         sel[i] = i;
6156       for (i = nelt / 2; i < nelt; i++)
6157         sel[i] = nelt + i;
6158       indices.new_vector (sel, 2, nelt);
6159       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6160         {
6161           if (dump_enabled_p ())
6162             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6163                              "select is not supported by target\n");
6164           return false;
6165         }
6166       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6167
6168       for (i = 0; i < log_length; i++)
6169         {
6170           for (j = 0; j < length; j += 2)
6171             {
6172               first_vect = dr_chain[j];
6173               second_vect = dr_chain[j + 1];
6174
6175               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6176               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6177                                                first_vect, first_vect,
6178                                                perm2_mask1);
6179               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6180               vect[0] = data_ref;
6181
6182               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6183               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6184                                                second_vect, second_vect,
6185                                                perm2_mask2);
6186               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6187               vect[1] = data_ref;
6188
6189               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6190               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6191                                                vect[0], vect[1], shift1_mask);
6192               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6193               (*result_chain)[j/2 + length/2] = data_ref;
6194
6195               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6196               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6197                                                vect[0], vect[1], select_mask);
6198               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6199               (*result_chain)[j/2] = data_ref;
6200             }
6201           memcpy (dr_chain.address (), result_chain->address (),
6202                   length * sizeof (tree));
6203         }
6204       return true;
6205     }
6206   if (length == 3 && vf > 2)
6207     {
6208       unsigned int k = 0, l = 0;
6209
6210       /* Generating permutation constant to get all elements in rigth order.
6211          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6212       for (i = 0; i < nelt; i++)
6213         {
6214           if (3 * k + (l % 3) >= nelt)
6215             {
6216               k = 0;
6217               l += (3 - (nelt % 3));
6218             }
6219           sel[i] = 3 * k + (l % 3);
6220           k++;
6221         }
6222       vec_perm_indices indices (sel, 2, nelt);
6223       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6224         {
6225           if (dump_enabled_p ())
6226             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6227                              "shuffle of 3 fields structure is not \
6228                               supported by target\n");
6229           return false;
6230         }
6231       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6232
6233       /* Generating permutation constant to shift all elements.
6234          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6235       for (i = 0; i < nelt; i++)
6236         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6237       indices.new_vector (sel, 2, nelt);
6238       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6239         {
6240           if (dump_enabled_p ())
6241             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6242                              "shift permutation is not supported by target\n");
6243           return false;
6244         }
6245       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6246
6247       /* Generating permutation constant to shift all elements.
6248          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6249       for (i = 0; i < nelt; i++)
6250         sel[i] = 2 * (nelt / 3) + 1 + i;
6251       indices.new_vector (sel, 2, nelt);
6252       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6253         {
6254           if (dump_enabled_p ())
6255             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6256                              "shift permutation is not supported by target\n");
6257           return false;
6258         }
6259       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6260
6261       /* Generating permutation constant to shift all elements.
6262          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6263       for (i = 0; i < nelt; i++)
6264         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6265       indices.new_vector (sel, 2, nelt);
6266       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6267         {
6268           if (dump_enabled_p ())
6269             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6270                              "shift permutation is not supported by target\n");
6271           return false;
6272         }
6273       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6274
6275       /* Generating permutation constant to shift all elements.
6276          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6277       for (i = 0; i < nelt; i++)
6278         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6279       indices.new_vector (sel, 2, nelt);
6280       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6281         {
6282           if (dump_enabled_p ())
6283             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6284                              "shift permutation is not supported by target\n");
6285           return false;
6286         }
6287       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6288
6289       for (k = 0; k < 3; k++)
6290         {
6291           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6292           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6293                                            dr_chain[k], dr_chain[k],
6294                                            perm3_mask);
6295           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6296           vect[k] = data_ref;
6297         }
6298
6299       for (k = 0; k < 3; k++)
6300         {
6301           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6302           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6303                                            vect[k % 3], vect[(k + 1) % 3],
6304                                            shift1_mask);
6305           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6306           vect_shift[k] = data_ref;
6307         }
6308
6309       for (k = 0; k < 3; k++)
6310         {
6311           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6312           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6313                                            vect_shift[(4 - k) % 3],
6314                                            vect_shift[(3 - k) % 3],
6315                                            shift2_mask);
6316           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6317           vect[k] = data_ref;
6318         }
6319
6320       (*result_chain)[3 - (nelt % 3)] = vect[2];
6321
6322       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6323       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6324                                        vect[0], shift3_mask);
6325       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6326       (*result_chain)[nelt % 3] = data_ref;
6327
6328       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6329       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6330                                        vect[1], shift4_mask);
6331       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6332       (*result_chain)[0] = data_ref;
6333       return true;
6334     }
6335   return false;
6336 }
6337
6338 /* Function vect_transform_grouped_load.
6339
6340    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6341    to perform their permutation and ascribe the result vectorized statements to
6342    the scalar statements.
6343 */
6344
6345 void
6346 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6347                              gimple_stmt_iterator *gsi)
6348 {
6349   machine_mode mode;
6350   vec<tree> result_chain = vNULL;
6351
6352   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6353      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6354      vectors, that are ready for vector computation.  */
6355   result_chain.create (size);
6356
6357   /* If reassociation width for vector type is 2 or greater target machine can
6358      execute 2 or more vector instructions in parallel.  Otherwise try to
6359      get chain for loads group using vect_shift_permute_load_chain.  */
6360   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6361   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6362       || pow2p_hwi (size)
6363       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6364                                          gsi, &result_chain))
6365     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6366   vect_record_grouped_load_vectors (stmt, result_chain);
6367   result_chain.release ();
6368 }
6369
6370 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6371    generated as part of the vectorization of STMT.  Assign the statement
6372    for each vector to the associated scalar statement.  */
6373
6374 void
6375 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6376 {
6377   gimple *first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6378   gimple *next_stmt, *new_stmt;
6379   unsigned int i, gap_count;
6380   tree tmp_data_ref;
6381
6382   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6383      Since we scan the chain starting from it's first node, their order
6384      corresponds the order of data-refs in RESULT_CHAIN.  */
6385   next_stmt = first_stmt;
6386   gap_count = 1;
6387   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6388     {
6389       if (!next_stmt)
6390         break;
6391
6392       /* Skip the gaps.  Loads created for the gaps will be removed by dead
6393        code elimination pass later.  No need to check for the first stmt in
6394        the group, since it always exists.
6395        DR_GROUP_GAP is the number of steps in elements from the previous
6396        access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
6397        correspond to the gaps.  */
6398       if (next_stmt != first_stmt
6399           && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
6400       {
6401         gap_count++;
6402         continue;
6403       }
6404
6405       while (next_stmt)
6406         {
6407           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6408           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6409              copies, and we put the new vector statement in the first available
6410              RELATED_STMT.  */
6411           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6412             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6413           else
6414             {
6415               if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6416                 {
6417                   gimple *prev_stmt =
6418                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6419                   gimple *rel_stmt =
6420                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6421                   while (rel_stmt)
6422                     {
6423                       prev_stmt = rel_stmt;
6424                       rel_stmt =
6425                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6426                     }
6427
6428                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6429                     new_stmt;
6430                 }
6431             }
6432
6433           next_stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6434           gap_count = 1;
6435           /* If NEXT_STMT accesses the same DR as the previous statement,
6436              put the same TMP_DATA_REF as its vectorized statement; otherwise
6437              get the next data-ref from RESULT_CHAIN.  */
6438           if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6439             break;
6440         }
6441     }
6442 }
6443
6444 /* Function vect_force_dr_alignment_p.
6445
6446    Returns whether the alignment of a DECL can be forced to be aligned
6447    on ALIGNMENT bit boundary.  */
6448
6449 bool
6450 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6451 {
6452   if (!VAR_P (decl))
6453     return false;
6454
6455   if (decl_in_symtab_p (decl)
6456       && !symtab_node::get (decl)->can_increase_alignment_p ())
6457     return false;
6458
6459   if (TREE_STATIC (decl))
6460     return (alignment <= MAX_OFILE_ALIGNMENT);
6461   else
6462     return (alignment <= MAX_STACK_ALIGNMENT);
6463 }
6464
6465
6466 /* Return whether the data reference DR is supported with respect to its
6467    alignment.
6468    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6469    it is aligned, i.e., check if it is possible to vectorize it with different
6470    alignment.  */
6471
6472 enum dr_alignment_support
6473 vect_supportable_dr_alignment (struct data_reference *dr,
6474                                bool check_aligned_accesses)
6475 {
6476   gimple *stmt = vect_dr_stmt (dr);
6477   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6478   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6479   machine_mode mode = TYPE_MODE (vectype);
6480   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6481   struct loop *vect_loop = NULL;
6482   bool nested_in_vect_loop = false;
6483
6484   if (aligned_access_p (dr) && !check_aligned_accesses)
6485     return dr_aligned;
6486
6487   /* For now assume all conditional loads/stores support unaligned
6488      access without any special code.  */
6489   if (is_gimple_call (stmt)
6490       && gimple_call_internal_p (stmt)
6491       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6492           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6493     return dr_unaligned_supported;
6494
6495   if (loop_vinfo)
6496     {
6497       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6498       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6499     }
6500
6501   /* Possibly unaligned access.  */
6502
6503   /* We can choose between using the implicit realignment scheme (generating
6504      a misaligned_move stmt) and the explicit realignment scheme (generating
6505      aligned loads with a REALIGN_LOAD).  There are two variants to the
6506      explicit realignment scheme: optimized, and unoptimized.
6507      We can optimize the realignment only if the step between consecutive
6508      vector loads is equal to the vector size.  Since the vector memory
6509      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6510      is guaranteed that the misalignment amount remains the same throughout the
6511      execution of the vectorized loop.  Therefore, we can create the
6512      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6513      at the loop preheader.
6514
6515      However, in the case of outer-loop vectorization, when vectorizing a
6516      memory access in the inner-loop nested within the LOOP that is now being
6517      vectorized, while it is guaranteed that the misalignment of the
6518      vectorized memory access will remain the same in different outer-loop
6519      iterations, it is *not* guaranteed that is will remain the same throughout
6520      the execution of the inner-loop.  This is because the inner-loop advances
6521      with the original scalar step (and not in steps of VS).  If the inner-loop
6522      step happens to be a multiple of VS, then the misalignment remains fixed
6523      and we can use the optimized realignment scheme.  For example:
6524
6525       for (i=0; i<N; i++)
6526         for (j=0; j<M; j++)
6527           s += a[i+j];
6528
6529      When vectorizing the i-loop in the above example, the step between
6530      consecutive vector loads is 1, and so the misalignment does not remain
6531      fixed across the execution of the inner-loop, and the realignment cannot
6532      be optimized (as illustrated in the following pseudo vectorized loop):
6533
6534       for (i=0; i<N; i+=4)
6535         for (j=0; j<M; j++){
6536           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6537                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6538                          // (assuming that we start from an aligned address).
6539           }
6540
6541      We therefore have to use the unoptimized realignment scheme:
6542
6543       for (i=0; i<N; i+=4)
6544           for (j=k; j<M; j+=4)
6545           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6546                            // that the misalignment of the initial address is
6547                            // 0).
6548
6549      The loop can then be vectorized as follows:
6550
6551       for (k=0; k<4; k++){
6552         rt = get_realignment_token (&vp[k]);
6553         for (i=0; i<N; i+=4){
6554           v1 = vp[i+k];
6555           for (j=k; j<M; j+=4){
6556             v2 = vp[i+j+VS-1];
6557             va = REALIGN_LOAD <v1,v2,rt>;
6558             vs += va;
6559             v1 = v2;
6560           }
6561         }
6562     } */
6563
6564   if (DR_IS_READ (dr))
6565     {
6566       bool is_packed = false;
6567       tree type = (TREE_TYPE (DR_REF (dr)));
6568
6569       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6570           && (!targetm.vectorize.builtin_mask_for_load
6571               || targetm.vectorize.builtin_mask_for_load ()))
6572         {
6573           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6574
6575           /* If we are doing SLP then the accesses need not have the
6576              same alignment, instead it depends on the SLP group size.  */
6577           if (loop_vinfo
6578               && STMT_SLP_TYPE (stmt_info)
6579               && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6580                               * DR_GROUP_SIZE (vinfo_for_stmt
6581                                             (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6582                               TYPE_VECTOR_SUBPARTS (vectype)))
6583             ;
6584           else if (!loop_vinfo
6585                    || (nested_in_vect_loop
6586                        && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6587                                     GET_MODE_SIZE (TYPE_MODE (vectype)))))
6588             return dr_explicit_realign;
6589           else
6590             return dr_explicit_realign_optimized;
6591         }
6592       if (!known_alignment_for_access_p (dr))
6593         is_packed = not_size_aligned (DR_REF (dr));
6594
6595       if (targetm.vectorize.support_vector_misalignment
6596             (mode, type, DR_MISALIGNMENT (dr), is_packed))
6597         /* Can't software pipeline the loads, but can at least do them.  */
6598         return dr_unaligned_supported;
6599     }
6600   else
6601     {
6602       bool is_packed = false;
6603       tree type = (TREE_TYPE (DR_REF (dr)));
6604
6605       if (!known_alignment_for_access_p (dr))
6606         is_packed = not_size_aligned (DR_REF (dr));
6607
6608      if (targetm.vectorize.support_vector_misalignment
6609            (mode, type, DR_MISALIGNMENT (dr), is_packed))
6610        return dr_unaligned_supported;
6611     }
6612
6613   /* Unsupported.  */
6614   return dr_unaligned_unsupported;
6615 }