gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "params.h"
  53 #include "tree-cfg.h"
  54 #include "tree-hash-traits.h"
  55 #include "vec-perm-indices.h"
  56 #include "internal-fn.h"
  57
  58 /* Return true if load- or store-lanes optab OPTAB is implemented for
  59    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  60
  61 static bool
  62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  63                               tree vectype, unsigned HOST_WIDE_INT count)
  64 {
  65   machine_mode mode, array_mode;
  66   bool limit_p;
  67
  68   mode = TYPE_MODE (vectype);
  69   if (!targetm.array_mode (mode, count).exists (&array_mode))
  70     {
  71       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
  72       limit_p = !targetm.array_mode_supported_p (mode, count);
  73       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
  74         {
  75           if (dump_enabled_p ())
  76             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  77                              "no array mode for %s["
  78                              HOST_WIDE_INT_PRINT_DEC "]\n",
  79                              GET_MODE_NAME (mode), count);
  80           return false;
  81         }
  82     }
  83
  84   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  85     {
  86       if (dump_enabled_p ())
  87         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  88                          "cannot use %s<%s><%s>\n", name,
  89                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  90       return false;
  91     }
  92
  93   if (dump_enabled_p ())
  94     dump_printf_loc (MSG_NOTE, vect_location,
  95                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  96                      GET_MODE_NAME (mode));
  97
  98   return true;
  99 }
 100
 101
 102 /* Return the smallest scalar part of STMT.
 103    This is used to determine the vectype of the stmt.  We generally set the
 104    vectype according to the type of the result (lhs).  For stmts whose
 105    result-type is different than the type of the arguments (e.g., demotion,
 106    promotion), vectype will be reset appropriately (later).  Note that we have
 107    to visit the smallest datatype in this function, because that determines the
 108    VF.  If the smallest datatype in the loop is present only as the rhs of a
 109    promotion operation - we'd miss it.
 110    Such a case, where a variable of this datatype does not appear in the lhs
 111    anywhere in the loop, can only occur if it's an invariant: e.g.:
 112    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 113    invariant motion.  However, we cannot rely on invariant motion to always
 114    take invariants out of the loop, and so in the case of promotion we also
 115    have to check the rhs.
 116    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 117    types.  */
 118
 119 tree
 120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
 121                                HOST_WIDE_INT *rhs_size_unit)
 122 {
 123   tree scalar_type = gimple_expr_type (stmt);
 124   HOST_WIDE_INT lhs, rhs;
 125
 126   /* During the analysis phase, this function is called on arbitrary
 127      statements that might not have scalar results.  */
 128   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
 129     return scalar_type;
 130
 131   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 132
 133   if (is_gimple_assign (stmt)
 134       && (gimple_assign_cast_p (stmt)
 135           || gimple_assign_rhs_code (stmt) == DOT_PROD_EXPR
 136           || gimple_assign_rhs_code (stmt) == WIDEN_SUM_EXPR
 137           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
 138           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
 139           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
 140     {
 141       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 142
 143       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 144       if (rhs < lhs)
 145         scalar_type = rhs_type;
 146     }
 147
 148   *lhs_size_unit = lhs;
 149   *rhs_size_unit = rhs;
 150   return scalar_type;
 151 }
 152
 153
 154 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 155    tested at run-time.  Return TRUE if DDR was successfully inserted.
 156    Return false if versioning is not supported.  */
 157
 158 static bool
 159 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 160 {
 161   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 162
 163   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 164     return false;
 165
 166   if (!runtime_alias_check_p (ddr, loop,
 167                               optimize_loop_nest_for_speed_p (loop)))
 168     return false;
 169
 170   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 171   return true;
 172 }
 173
 174 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
 175
 176 static void
 177 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
 178 {
 179   vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
 180   for (unsigned int i = 0; i < checks.length(); ++i)
 181     if (checks[i] == value)
 182       return;
 183
 184   if (dump_enabled_p ())
 185     {
 186       dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
 187       dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
 188       dump_printf (MSG_NOTE, " is nonzero\n");
 189     }
 190   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
 191 }
 192
 193 /* Return true if we know that the order of vectorized STMT_A and
 194    vectorized STMT_B will be the same as the order of STMT_A and STMT_B.
 195    At least one of the statements is a write.  */
 196
 197 static bool
 198 vect_preserves_scalar_order_p (gimple *stmt_a, gimple *stmt_b)
 199 {
 200   stmt_vec_info stmtinfo_a = vinfo_for_stmt (stmt_a);
 201   stmt_vec_info stmtinfo_b = vinfo_for_stmt (stmt_b);
 202
 203   /* Single statements are always kept in their original order.  */
 204   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 205       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 206     return true;
 207
 208   /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
 209      group are emitted at the position of the first scalar load and all
 210      stores in a group are emitted at the position of the last scalar store.
 211      Thus writes will happen no earlier than their current position
 212      (but could happen later) while reads will happen no later than their
 213      current position (but could happen earlier).  Reordering is therefore
 214      only possible if the first access is a write.  */
 215   if (is_pattern_stmt_p (stmtinfo_a))
 216     stmt_a = STMT_VINFO_RELATED_STMT (stmtinfo_a);
 217   if (is_pattern_stmt_p (stmtinfo_b))
 218     stmt_b = STMT_VINFO_RELATED_STMT (stmtinfo_b);
 219   gimple *earlier_stmt = get_earlier_stmt (stmt_a, stmt_b);
 220   return !DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt)));
 221 }
 222
 223 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 224    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 225    distances.  These distances are conservatively correct but they don't
 226    reflect a guaranteed dependence.
 227
 228    Return true if this function does all the work necessary to avoid
 229    an alias or false if the caller should use the dependence distances
 230    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 231    the depth of the loop described by LOOP_VINFO and the other arguments
 232    are as for vect_analyze_data_ref_dependence.  */
 233
 234 static bool
 235 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 236                                        loop_vec_info loop_vinfo,
 237                                        int loop_depth, unsigned int *max_vf)
 238 {
 239   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 240   lambda_vector dist_v;
 241   unsigned int i;
 242   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 243     {
 244       int dist = dist_v[loop_depth];
 245       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 246         {
 247           /* If the user asserted safelen >= DIST consecutive iterations
 248              can be executed concurrently, assume independence.
 249
 250              ??? An alternative would be to add the alias check even
 251              in this case, and vectorize the fallback loop with the
 252              maximum VF set to safelen.  However, if the user has
 253              explicitly given a length, it's less likely that that
 254              would be a win.  */
 255           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 256             {
 257               if ((unsigned int) loop->safelen < *max_vf)
 258                 *max_vf = loop->safelen;
 259               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 260               continue;
 261             }
 262
 263           /* For dependence distances of 2 or more, we have the option
 264              of limiting VF or checking for an alias at runtime.
 265              Prefer to check at runtime if we can, to avoid limiting
 266              the VF unnecessarily when the bases are in fact independent.
 267
 268              Note that the alias checks will be removed if the VF ends up
 269              being small enough.  */
 270           return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 271         }
 272     }
 273   return true;
 274 }
 275
 276
 277 /* Function vect_analyze_data_ref_dependence.
 278
 279    Return TRUE if there (might) exist a dependence between a memory-reference
 280    DRA and a memory-reference DRB.  When versioning for alias may check a
 281    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 282    the data dependence.  */
 283
 284 static bool
 285 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 286                                   loop_vec_info loop_vinfo,
 287                                   unsigned int *max_vf)
 288 {
 289   unsigned int i;
 290   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 291   struct data_reference *dra = DDR_A (ddr);
 292   struct data_reference *drb = DDR_B (ddr);
 293   stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
 294   stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
 295   lambda_vector dist_v;
 296   unsigned int loop_depth;
 297
 298   /* In loop analysis all data references should be vectorizable.  */
 299   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 300       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 301     gcc_unreachable ();
 302
 303   /* Independent data accesses.  */
 304   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 305     return false;
 306
 307   if (dra == drb
 308       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 309     return false;
 310
 311   /* We do not have to consider dependences between accesses that belong
 312      to the same group, unless the stride could be smaller than the
 313      group size.  */
 314   if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 315       && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 316           == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
 317       && !STMT_VINFO_STRIDED_P (stmtinfo_a))
 318     return false;
 319
 320   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 321      least two scalar iterations, there is always also a true dependence.
 322      As the vectorizer does not re-order loads and stores we can ignore
 323      the anti-dependence if TBAA can disambiguate both DRs similar to the
 324      case with known negative distance anti-dependences (positive
 325      distance anti-dependences would violate TBAA constraints).  */
 326   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 327        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 328       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 329                                  get_alias_set (DR_REF (drb))))
 330     return false;
 331
 332   /* Unknown data dependence.  */
 333   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 334     {
 335       /* If user asserted safelen consecutive iterations can be
 336          executed concurrently, assume independence.  */
 337       if (loop->safelen >= 2)
 338         {
 339           if ((unsigned int) loop->safelen < *max_vf)
 340             *max_vf = loop->safelen;
 341           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 342           return false;
 343         }
 344
 345       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 346           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 347         {
 348           if (dump_enabled_p ())
 349             {
 350               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 351                                "versioning for alias not supported for: "
 352                                "can't determine dependence between ");
 353               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 354                                  DR_REF (dra));
 355               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 356               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 357                                  DR_REF (drb));
 358               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 359             }
 360           return true;
 361         }
 362
 363       if (dump_enabled_p ())
 364         {
 365           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 366                            "versioning for alias required: "
 367                            "can't determine dependence between ");
 368           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 369                              DR_REF (dra));
 370           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 371           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 372                              DR_REF (drb));
 373           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 374         }
 375
 376       /* Add to list of ddrs that need to be tested at run-time.  */
 377       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 378     }
 379
 380   /* Known data dependence.  */
 381   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 382     {
 383       /* If user asserted safelen consecutive iterations can be
 384          executed concurrently, assume independence.  */
 385       if (loop->safelen >= 2)
 386         {
 387           if ((unsigned int) loop->safelen < *max_vf)
 388             *max_vf = loop->safelen;
 389           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 390           return false;
 391         }
 392
 393       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 394           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 395         {
 396           if (dump_enabled_p ())
 397             {
 398               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 399                                "versioning for alias not supported for: "
 400                                "bad dist vector for ");
 401               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 402                                  DR_REF (dra));
 403               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 404               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 405                                  DR_REF (drb));
 406               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 407             }
 408           return true;
 409         }
 410
 411       if (dump_enabled_p ())
 412         {
 413           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 414                            "versioning for alias required: "
 415                            "bad dist vector for ");
 416           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 417           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 418           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 419           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 420         }
 421       /* Add to list of ddrs that need to be tested at run-time.  */
 422       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 423     }
 424
 425   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 426
 427   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
 428       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
 429                                                 loop_depth, max_vf))
 430     return false;
 431
 432   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 433     {
 434       int dist = dist_v[loop_depth];
 435
 436       if (dump_enabled_p ())
 437         dump_printf_loc (MSG_NOTE, vect_location,
 438                          "dependence distance  = %d.\n", dist);
 439
 440       if (dist == 0)
 441         {
 442           if (dump_enabled_p ())
 443             {
 444               dump_printf_loc (MSG_NOTE, vect_location,
 445                                "dependence distance == 0 between ");
 446               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 447               dump_printf (MSG_NOTE, " and ");
 448               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 449               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 450             }
 451
 452           /* When we perform grouped accesses and perform implicit CSE
 453              by detecting equal accesses and doing disambiguation with
 454              runtime alias tests like for
 455                 .. = a[i];
 456                 .. = a[i+1];
 457                 a[i] = ..;
 458                 a[i+1] = ..;
 459                 *p = ..;
 460                 .. = a[i];
 461                 .. = a[i+1];
 462              where we will end up loading { a[i], a[i+1] } once, make
 463              sure that inserting group loads before the first load and
 464              stores after the last store will do the right thing.
 465              Similar for groups like
 466                 a[i] = ...;
 467                 ... = a[i];
 468                 a[i+1] = ...;
 469              where loads from the group interleave with the store.  */
 470           if (!vect_preserves_scalar_order_p (vect_dr_stmt(dra),
 471                                               vect_dr_stmt (drb)))
 472             {
 473               if (dump_enabled_p ())
 474                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 475                                  "READ_WRITE dependence in interleaving.\n");
 476               return true;
 477             }
 478
 479           if (loop->safelen < 2)
 480             {
 481               tree indicator = dr_zero_step_indicator (dra);
 482               if (TREE_CODE (indicator) != INTEGER_CST)
 483                 vect_check_nonzero_value (loop_vinfo, indicator);
 484               else if (integer_zerop (indicator))
 485                 {
 486                   if (dump_enabled_p ())
 487                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 488                                  "access also has a zero step\n");
 489                   return true;
 490                 }
 491             }
 492           continue;
 493         }
 494
 495       if (dist > 0 && DDR_REVERSED_P (ddr))
 496         {
 497           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 498              reversed (to make distance vector positive), and the actual
 499              distance is negative.  */
 500           if (dump_enabled_p ())
 501             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 502                              "dependence distance negative.\n");
 503           /* Record a negative dependence distance to later limit the
 504              amount of stmt copying / unrolling we can perform.
 505              Only need to handle read-after-write dependence.  */
 506           if (DR_IS_READ (drb)
 507               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 508                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 509             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 510           continue;
 511         }
 512
 513       unsigned int abs_dist = abs (dist);
 514       if (abs_dist >= 2 && abs_dist < *max_vf)
 515         {
 516           /* The dependence distance requires reduction of the maximal
 517              vectorization factor.  */
 518           *max_vf = abs (dist);
 519           if (dump_enabled_p ())
 520             dump_printf_loc (MSG_NOTE, vect_location,
 521                              "adjusting maximal vectorization factor to %i\n",
 522                              *max_vf);
 523         }
 524
 525       if (abs_dist >= *max_vf)
 526         {
 527           /* Dependence distance does not create dependence, as far as
 528              vectorization is concerned, in this case.  */
 529           if (dump_enabled_p ())
 530             dump_printf_loc (MSG_NOTE, vect_location,
 531                              "dependence distance >= VF.\n");
 532           continue;
 533         }
 534
 535       if (dump_enabled_p ())
 536         {
 537           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 538                        "not vectorized, possible dependence "
 539                        "between data-refs ");
 540           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 541           dump_printf (MSG_NOTE,  " and ");
 542           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 543           dump_printf (MSG_NOTE,  "\n");
 544         }
 545
 546       return true;
 547     }
 548
 549   return false;
 550 }
 551
 552 /* Function vect_analyze_data_ref_dependences.
 553
 554    Examine all the data references in the loop, and make sure there do not
 555    exist any data dependences between them.  Set *MAX_VF according to
 556    the maximum vectorization factor the data dependences allow.  */
 557
 558 bool
 559 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
 560                                    unsigned int *max_vf)
 561 {
 562   unsigned int i;
 563   struct data_dependence_relation *ddr;
 564
 565   DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
 566
 567   LOOP_VINFO_DDRS (loop_vinfo)
 568     .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 569              * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 570   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 571   /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
 572   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 573                                 &LOOP_VINFO_DDRS (loop_vinfo),
 574                                 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
 575     return false;
 576
 577   /* For epilogues we either have no aliases or alias versioning
 578      was applied to original loop.  Therefore we may just get max_vf
 579      using VF of original loop.  */
 580   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 581     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 582   else
 583     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 584       if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
 585         return false;
 586
 587   return true;
 588 }
 589
 590
 591 /* Function vect_slp_analyze_data_ref_dependence.
 592
 593    Return TRUE if there (might) exist a dependence between a memory-reference
 594    DRA and a memory-reference DRB.  When versioning for alias may check a
 595    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 596    the data dependence.  */
 597
 598 static bool
 599 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
 600 {
 601   struct data_reference *dra = DDR_A (ddr);
 602   struct data_reference *drb = DDR_B (ddr);
 603
 604   /* We need to check dependences of statements marked as unvectorizable
 605      as well, they still can prohibit vectorization.  */
 606
 607   /* Independent data accesses.  */
 608   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 609     return false;
 610
 611   if (dra == drb)
 612     return false;
 613
 614   /* Read-read is OK.  */
 615   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 616     return false;
 617
 618   /* If dra and drb are part of the same interleaving chain consider
 619      them independent.  */
 620   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (vect_dr_stmt (dra)))
 621       && (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (dra)))
 622           == DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (drb)))))
 623     return false;
 624
 625   /* Unknown data dependence.  */
 626   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 627     {
 628       if  (dump_enabled_p ())
 629         {
 630           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 631                            "can't determine dependence between ");
 632           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 633           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 634           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 635           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 636         }
 637     }
 638   else if (dump_enabled_p ())
 639     {
 640       dump_printf_loc (MSG_NOTE, vect_location,
 641                        "determined dependence between ");
 642       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 643       dump_printf (MSG_NOTE, " and ");
 644       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 645       dump_printf (MSG_NOTE,  "\n");
 646     }
 647
 648   return true;
 649 }
 650
 651
 652 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 653    contain the vector of scalar stores of this instance if we are
 654    disambiguating the loads.  */
 655
 656 static bool
 657 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
 658                                    vec<gimple *> stores, gimple *last_store)
 659 {
 660   /* This walks over all stmts involved in the SLP load/store done
 661      in NODE verifying we can sink them up to the last stmt in the
 662      group.  */
 663   gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
 664   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 665     {
 666       gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
 667       if (access == last_access)
 668         continue;
 669       data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
 670       ao_ref ref;
 671       bool ref_initialized_p = false;
 672       for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
 673            gsi_stmt (gsi) != last_access; gsi_next (&gsi))
 674         {
 675           gimple *stmt = gsi_stmt (gsi);
 676           if (! gimple_vuse (stmt)
 677               || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
 678             continue;
 679
 680           /* If we couldn't record a (single) data reference for this
 681              stmt we have to resort to the alias oracle.  */
 682           data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
 683           if (!dr_b)
 684             {
 685               /* We are moving a store or sinking a load - this means
 686                  we cannot use TBAA for disambiguation.  */
 687               if (!ref_initialized_p)
 688                 ao_ref_init (&ref, DR_REF (dr_a));
 689               if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
 690                   || ref_maybe_used_by_stmt_p (stmt, &ref, false))
 691                 return false;
 692               continue;
 693             }
 694
 695           bool dependent = false;
 696           /* If we run into a store of this same instance (we've just
 697              marked those) then delay dependence checking until we run
 698              into the last store because this is where it will have
 699              been sunk to (and we verify if we can do that as well).  */
 700           if (gimple_visited_p (stmt))
 701             {
 702               if (stmt != last_store)
 703                 continue;
 704               unsigned i;
 705               gimple *store;
 706               FOR_EACH_VEC_ELT (stores, i, store)
 707                 {
 708                   data_reference *store_dr
 709                     = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
 710                   ddr_p ddr = initialize_data_dependence_relation
 711                                 (dr_a, store_dr, vNULL);
 712                   dependent = vect_slp_analyze_data_ref_dependence (ddr);
 713                   free_dependence_relation (ddr);
 714                   if (dependent)
 715                     break;
 716                 }
 717             }
 718           else
 719             {
 720               ddr_p ddr = initialize_data_dependence_relation (dr_a,
 721                                                                dr_b, vNULL);
 722               dependent = vect_slp_analyze_data_ref_dependence (ddr);
 723               free_dependence_relation (ddr);
 724             }
 725           if (dependent)
 726             return false;
 727         }
 728     }
 729   return true;
 730 }
 731
 732
 733 /* Function vect_analyze_data_ref_dependences.
 734
 735    Examine all the data references in the basic-block, and make sure there
 736    do not exist any data dependences between them.  Set *MAX_VF according to
 737    the maximum vectorization factor the data dependences allow.  */
 738
 739 bool
 740 vect_slp_analyze_instance_dependence (slp_instance instance)
 741 {
 742   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
 743
 744   /* The stores of this instance are at the root of the SLP tree.  */
 745   slp_tree store = SLP_INSTANCE_TREE (instance);
 746   if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
 747     store = NULL;
 748
 749   /* Verify we can sink stores to the vectorized stmt insert location.  */
 750   gimple *last_store = NULL;
 751   if (store)
 752     {
 753       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
 754         return false;
 755
 756       /* Mark stores in this instance and remember the last one.  */
 757       last_store = vect_find_last_scalar_stmt_in_slp (store);
 758       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 759         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
 760     }
 761
 762   bool res = true;
 763
 764   /* Verify we can sink loads to the vectorized stmt insert location,
 765      special-casing stores of this instance.  */
 766   slp_tree load;
 767   unsigned int i;
 768   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
 769     if (! vect_slp_analyze_node_dependences (instance, load,
 770                                              store
 771                                              ? SLP_TREE_SCALAR_STMTS (store)
 772                                              : vNULL, last_store))
 773       {
 774         res = false;
 775         break;
 776       }
 777
 778   /* Unset the visited flag.  */
 779   if (store)
 780     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 781       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
 782
 783   return res;
 784 }
 785
 786 /* Record in VINFO the base alignment guarantee given by DRB.  STMT is
 787    the statement that contains DRB, which is useful for recording in the
 788    dump file.  */
 789
 790 static void
 791 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
 792                             innermost_loop_behavior *drb)
 793 {
 794   bool existed;
 795   innermost_loop_behavior *&entry
 796     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
 797   if (!existed || entry->base_alignment < drb->base_alignment)
 798     {
 799       entry = drb;
 800       if (dump_enabled_p ())
 801         {
 802           dump_printf_loc (MSG_NOTE, vect_location,
 803                            "recording new base alignment for ");
 804           dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
 805           dump_printf (MSG_NOTE, "\n");
 806           dump_printf_loc (MSG_NOTE, vect_location,
 807                            "  alignment:    %d\n", drb->base_alignment);
 808           dump_printf_loc (MSG_NOTE, vect_location,
 809                            "  misalignment: %d\n", drb->base_misalignment);
 810           dump_printf_loc (MSG_NOTE, vect_location,
 811                            "  based on:     ");
 812           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 813         }
 814     }
 815 }
 816
 817 /* If the region we're going to vectorize is reached, all unconditional
 818    data references occur at least once.  We can therefore pool the base
 819    alignment guarantees from each unconditional reference.  Do this by
 820    going through all the data references in VINFO and checking whether
 821    the containing statement makes the reference unconditionally.  If so,
 822    record the alignment of the base address in VINFO so that it can be
 823    used for all other references with the same base.  */
 824
 825 void
 826 vect_record_base_alignments (vec_info *vinfo)
 827 {
 828   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
 829   struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
 830   data_reference *dr;
 831   unsigned int i;
 832   FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
 833     {
 834       gimple *stmt = vect_dr_stmt (dr);
 835       if (!DR_IS_CONDITIONAL_IN_STMT (dr)
 836           && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)))
 837         {
 838           vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
 839
 840           /* If DR is nested in the loop that is being vectorized, we can also
 841              record the alignment of the base wrt the outer loop.  */
 842           if (loop && nested_in_vect_loop_p (loop, stmt))
 843             {
 844               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 845               vect_record_base_alignment
 846                  (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
 847             }
 848         }
 849     }
 850 }
 851
 852 /* Return the target alignment for the vectorized form of DR.  */
 853
 854 static unsigned int
 855 vect_calculate_target_alignment (struct data_reference *dr)
 856 {
 857   gimple *stmt = vect_dr_stmt (dr);
 858   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 859   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 860   return targetm.vectorize.preferred_vector_alignment (vectype);
 861 }
 862
 863 /* Function vect_compute_data_ref_alignment
 864
 865    Compute the misalignment of the data reference DR.
 866
 867    Output:
 868    1. If during the misalignment computation it is found that the data reference
 869       cannot be vectorized then false is returned.
 870    2. DR_MISALIGNMENT (DR) is defined.
 871
 872    FOR NOW: No analysis is actually performed. Misalignment is calculated
 873    only for trivial cases. TODO.  */
 874
 875 static bool
 876 vect_compute_data_ref_alignment (struct data_reference *dr)
 877 {
 878   gimple *stmt = vect_dr_stmt (dr);
 879   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 880   vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
 881   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 882   struct loop *loop = NULL;
 883   tree ref = DR_REF (dr);
 884   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 885
 886   if (dump_enabled_p ())
 887     dump_printf_loc (MSG_NOTE, vect_location,
 888                      "vect_compute_data_ref_alignment:\n");
 889
 890   if (loop_vinfo)
 891     loop = LOOP_VINFO_LOOP (loop_vinfo);
 892
 893   /* Initialize misalignment to unknown.  */
 894   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
 895
 896   innermost_loop_behavior *drb = vect_dr_behavior (dr);
 897   bool step_preserves_misalignment_p;
 898
 899   unsigned HOST_WIDE_INT vector_alignment
 900     = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
 901   DR_TARGET_ALIGNMENT (dr) = vector_alignment;
 902
 903   /* No step for BB vectorization.  */
 904   if (!loop)
 905     {
 906       gcc_assert (integer_zerop (drb->step));
 907       step_preserves_misalignment_p = true;
 908     }
 909
 910   /* In case the dataref is in an inner-loop of the loop that is being
 911      vectorized (LOOP), we use the base and misalignment information
 912      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 913      stays the same throughout the execution of the inner-loop, which is why
 914      we have to check that the stride of the dataref in the inner-loop evenly
 915      divides by the vector alignment.  */
 916   else if (nested_in_vect_loop_p (loop, stmt))
 917     {
 918       step_preserves_misalignment_p
 919         = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
 920
 921       if (dump_enabled_p ())
 922         {
 923           if (step_preserves_misalignment_p)
 924             dump_printf_loc (MSG_NOTE, vect_location,
 925                              "inner step divides the vector alignment.\n");
 926           else
 927             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 928                              "inner step doesn't divide the vector"
 929                              " alignment.\n");
 930         }
 931     }
 932
 933   /* Similarly we can only use base and misalignment information relative to
 934      an innermost loop if the misalignment stays the same throughout the
 935      execution of the loop.  As above, this is the case if the stride of
 936      the dataref evenly divides by the alignment.  */
 937   else
 938     {
 939       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 940       step_preserves_misalignment_p
 941         = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
 942
 943       if (!step_preserves_misalignment_p && dump_enabled_p ())
 944         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 945                          "step doesn't divide the vector alignment.\n");
 946     }
 947
 948   unsigned int base_alignment = drb->base_alignment;
 949   unsigned int base_misalignment = drb->base_misalignment;
 950
 951   /* Calculate the maximum of the pooled base address alignment and the
 952      alignment that we can compute for DR itself.  */
 953   innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
 954   if (entry && base_alignment < (*entry)->base_alignment)
 955     {
 956       base_alignment = (*entry)->base_alignment;
 957       base_misalignment = (*entry)->base_misalignment;
 958     }
 959
 960   if (drb->offset_alignment < vector_alignment
 961       || !step_preserves_misalignment_p
 962       /* We need to know whether the step wrt the vectorized loop is
 963          negative when computing the starting misalignment below.  */
 964       || TREE_CODE (drb->step) != INTEGER_CST)
 965     {
 966       if (dump_enabled_p ())
 967         {
 968           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 969                            "Unknown alignment for access: ");
 970           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 971           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 972         }
 973       return true;
 974     }
 975
 976   if (base_alignment < vector_alignment)
 977     {
 978       unsigned int max_alignment;
 979       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
 980       if (max_alignment < vector_alignment
 981           || !vect_can_force_dr_alignment_p (base,
 982                                              vector_alignment * BITS_PER_UNIT))
 983         {
 984           if (dump_enabled_p ())
 985             {
 986               dump_printf_loc (MSG_NOTE, vect_location,
 987                                "can't force alignment of ref: ");
 988               dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 989               dump_printf (MSG_NOTE, "\n");
 990             }
 991           return true;
 992         }
 993
 994       /* Force the alignment of the decl.
 995          NOTE: This is the only change to the code we make during
 996          the analysis phase, before deciding to vectorize the loop.  */
 997       if (dump_enabled_p ())
 998         {
 999           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
1000           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1001           dump_printf (MSG_NOTE, "\n");
1002         }
1003
1004       DR_VECT_AUX (dr)->base_decl = base;
1005       DR_VECT_AUX (dr)->base_misaligned = true;
1006       base_misalignment = 0;
1007     }
1008   poly_int64 misalignment
1009     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1010
1011   /* If this is a backward running DR then first access in the larger
1012      vectype actually is N-1 elements before the address in the DR.
1013      Adjust misalign accordingly.  */
1014   if (tree_int_cst_sgn (drb->step) < 0)
1015     /* PLUS because STEP is negative.  */
1016     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1017                      * TREE_INT_CST_LOW (drb->step));
1018
1019   unsigned int const_misalignment;
1020   if (!known_misalignment (misalignment, vector_alignment,
1021                            &const_misalignment))
1022     {
1023       if (dump_enabled_p ())
1024         {
1025           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1026                            "Non-constant misalignment for access: ");
1027           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1028           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1029         }
1030       return true;
1031     }
1032
1033   SET_DR_MISALIGNMENT (dr, const_misalignment);
1034
1035   if (dump_enabled_p ())
1036     {
1037       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1038                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1039       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1040       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1041     }
1042
1043   return true;
1044 }
1045
1046 /* Function vect_update_misalignment_for_peel.
1047    Sets DR's misalignment
1048    - to 0 if it has the same alignment as DR_PEEL,
1049    - to the misalignment computed using NPEEL if DR's salignment is known,
1050    - to -1 (unknown) otherwise.
1051
1052    DR - the data reference whose misalignment is to be adjusted.
1053    DR_PEEL - the data reference whose misalignment is being made
1054              zero in the vector loop by the peel.
1055    NPEEL - the number of iterations in the peel loop if the misalignment
1056            of DR_PEEL is known at compile time.  */
1057
1058 static void
1059 vect_update_misalignment_for_peel (struct data_reference *dr,
1060                                    struct data_reference *dr_peel, int npeel)
1061 {
1062   unsigned int i;
1063   vec<dr_p> same_aligned_drs;
1064   struct data_reference *current_dr;
1065   int dr_size = vect_get_scalar_dr_size (dr);
1066   int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1067   stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
1068   stmt_vec_info peel_stmt_info = vinfo_for_stmt (vect_dr_stmt (dr_peel));
1069
1070  /* For interleaved data accesses the step in the loop must be multiplied by
1071      the size of the interleaving group.  */
1072   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1073     dr_size *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
1074   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1075     dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1076
1077   /* It can be assumed that the data refs with the same alignment as dr_peel
1078      are aligned in the vector loop.  */
1079   same_aligned_drs
1080     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (vect_dr_stmt (dr_peel)));
1081   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1082     {
1083       if (current_dr != dr)
1084         continue;
1085       gcc_assert (!known_alignment_for_access_p (dr)
1086                   || !known_alignment_for_access_p (dr_peel)
1087                   || (DR_MISALIGNMENT (dr) / dr_size
1088                       == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1089       SET_DR_MISALIGNMENT (dr, 0);
1090       return;
1091     }
1092
1093   if (known_alignment_for_access_p (dr)
1094       && known_alignment_for_access_p (dr_peel))
1095     {
1096       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1097       int misal = DR_MISALIGNMENT (dr);
1098       misal += negative ? -npeel * dr_size : npeel * dr_size;
1099       misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1100       SET_DR_MISALIGNMENT (dr, misal);
1101       return;
1102     }
1103
1104   if (dump_enabled_p ())
1105     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1106                      "to unknown (-1).\n");
1107   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1108 }
1109
1110
1111 /* Function verify_data_ref_alignment
1112
1113    Return TRUE if DR can be handled with respect to alignment.  */
1114
1115 static bool
1116 verify_data_ref_alignment (data_reference_p dr)
1117 {
1118   enum dr_alignment_support supportable_dr_alignment
1119     = vect_supportable_dr_alignment (dr, false);
1120   if (!supportable_dr_alignment)
1121     {
1122       if (dump_enabled_p ())
1123         {
1124           if (DR_IS_READ (dr))
1125             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1126                              "not vectorized: unsupported unaligned load.");
1127           else
1128             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1129                              "not vectorized: unsupported unaligned "
1130                              "store.");
1131
1132           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1133                              DR_REF (dr));
1134           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1135         }
1136       return false;
1137     }
1138
1139   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1140     dump_printf_loc (MSG_NOTE, vect_location,
1141                      "Vectorizing an unaligned access.\n");
1142
1143   return true;
1144 }
1145
1146 /* Function vect_verify_datarefs_alignment
1147
1148    Return TRUE if all data references in the loop can be
1149    handled with respect to alignment.  */
1150
1151 bool
1152 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1153 {
1154   vec<data_reference_p> datarefs = vinfo->datarefs;
1155   struct data_reference *dr;
1156   unsigned int i;
1157
1158   FOR_EACH_VEC_ELT (datarefs, i, dr)
1159     {
1160       gimple *stmt = vect_dr_stmt (dr);
1161       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1162
1163       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1164         continue;
1165
1166       /* For interleaving, only the alignment of the first access matters.   */
1167       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1168           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1169         continue;
1170
1171       /* Strided accesses perform only component accesses, alignment is
1172          irrelevant for them.  */
1173       if (STMT_VINFO_STRIDED_P (stmt_info)
1174           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1175         continue;
1176
1177       if (! verify_data_ref_alignment (dr))
1178         return false;
1179     }
1180
1181   return true;
1182 }
1183
1184 /* Given an memory reference EXP return whether its alignment is less
1185    than its size.  */
1186
1187 static bool
1188 not_size_aligned (tree exp)
1189 {
1190   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1191     return true;
1192
1193   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1194           > get_object_alignment (exp));
1195 }
1196
1197 /* Function vector_alignment_reachable_p
1198
1199    Return true if vector alignment for DR is reachable by peeling
1200    a few loop iterations.  Return false otherwise.  */
1201
1202 static bool
1203 vector_alignment_reachable_p (struct data_reference *dr)
1204 {
1205   gimple *stmt = vect_dr_stmt (dr);
1206   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1207   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1208
1209   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1210     {
1211       /* For interleaved access we peel only if number of iterations in
1212          the prolog loop ({VF - misalignment}), is a multiple of the
1213          number of the interleaved accesses.  */
1214       int elem_size, mis_in_elements;
1215
1216       /* FORNOW: handle only known alignment.  */
1217       if (!known_alignment_for_access_p (dr))
1218         return false;
1219
1220       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1221       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1222       elem_size = vector_element_size (vector_size, nelements);
1223       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1224
1225       if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1226         return false;
1227     }
1228
1229   /* If misalignment is known at the compile time then allow peeling
1230      only if natural alignment is reachable through peeling.  */
1231   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1232     {
1233       HOST_WIDE_INT elmsize =
1234                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1235       if (dump_enabled_p ())
1236         {
1237           dump_printf_loc (MSG_NOTE, vect_location,
1238                            "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1239           dump_printf (MSG_NOTE,
1240                        ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1241         }
1242       if (DR_MISALIGNMENT (dr) % elmsize)
1243         {
1244           if (dump_enabled_p ())
1245             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1246                              "data size does not divide the misalignment.\n");
1247           return false;
1248         }
1249     }
1250
1251   if (!known_alignment_for_access_p (dr))
1252     {
1253       tree type = TREE_TYPE (DR_REF (dr));
1254       bool is_packed = not_size_aligned (DR_REF (dr));
1255       if (dump_enabled_p ())
1256         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1257                          "Unknown misalignment, %snaturally aligned\n",
1258                          is_packed ? "not " : "");
1259       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1260     }
1261
1262   return true;
1263 }
1264
1265
1266 /* Calculate the cost of the memory access represented by DR.  */
1267
1268 static void
1269 vect_get_data_access_cost (struct data_reference *dr,
1270                            unsigned int *inside_cost,
1271                            unsigned int *outside_cost,
1272                            stmt_vector_for_cost *body_cost_vec,
1273                            stmt_vector_for_cost *prologue_cost_vec)
1274 {
1275   gimple *stmt = vect_dr_stmt (dr);
1276   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1277   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1278   int ncopies;
1279
1280   if (PURE_SLP_STMT (stmt_info))
1281     ncopies = 1;
1282   else
1283     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1284
1285   if (DR_IS_READ (dr))
1286     vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1287                         prologue_cost_vec, body_cost_vec, false);
1288   else
1289     vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1290
1291   if (dump_enabled_p ())
1292     dump_printf_loc (MSG_NOTE, vect_location,
1293                      "vect_get_data_access_cost: inside_cost = %d, "
1294                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1295 }
1296
1297
1298 typedef struct _vect_peel_info
1299 {
1300   struct data_reference *dr;
1301   int npeel;
1302   unsigned int count;
1303 } *vect_peel_info;
1304
1305 typedef struct _vect_peel_extended_info
1306 {
1307   struct _vect_peel_info peel_info;
1308   unsigned int inside_cost;
1309   unsigned int outside_cost;
1310 } *vect_peel_extended_info;
1311
1312
1313 /* Peeling hashtable helpers.  */
1314
1315 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1316 {
1317   static inline hashval_t hash (const _vect_peel_info *);
1318   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1319 };
1320
1321 inline hashval_t
1322 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1323 {
1324   return (hashval_t) peel_info->npeel;
1325 }
1326
1327 inline bool
1328 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1329 {
1330   return (a->npeel == b->npeel);
1331 }
1332
1333
1334 /* Insert DR into peeling hash table with NPEEL as key.  */
1335
1336 static void
1337 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1338                           loop_vec_info loop_vinfo, struct data_reference *dr,
1339                           int npeel)
1340 {
1341   struct _vect_peel_info elem, *slot;
1342   _vect_peel_info **new_slot;
1343   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1344
1345   elem.npeel = npeel;
1346   slot = peeling_htab->find (&elem);
1347   if (slot)
1348     slot->count++;
1349   else
1350     {
1351       slot = XNEW (struct _vect_peel_info);
1352       slot->npeel = npeel;
1353       slot->dr = dr;
1354       slot->count = 1;
1355       new_slot = peeling_htab->find_slot (slot, INSERT);
1356       *new_slot = slot;
1357     }
1358
1359   if (!supportable_dr_alignment
1360       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1361     slot->count += VECT_MAX_COST;
1362 }
1363
1364
1365 /* Traverse peeling hash table to find peeling option that aligns maximum
1366    number of data accesses.  */
1367
1368 int
1369 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1370                                      _vect_peel_extended_info *max)
1371 {
1372   vect_peel_info elem = *slot;
1373
1374   if (elem->count > max->peel_info.count
1375       || (elem->count == max->peel_info.count
1376           && max->peel_info.npeel > elem->npeel))
1377     {
1378       max->peel_info.npeel = elem->npeel;
1379       max->peel_info.count = elem->count;
1380       max->peel_info.dr = elem->dr;
1381     }
1382
1383   return 1;
1384 }
1385
1386 /* Get the costs of peeling NPEEL iterations checking data access costs
1387    for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1388    misalignment will be zero after peeling.  */
1389
1390 static void
1391 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1392                                 struct data_reference *dr0,
1393                                 unsigned int *inside_cost,
1394                                 unsigned int *outside_cost,
1395                                 stmt_vector_for_cost *body_cost_vec,
1396                                 stmt_vector_for_cost *prologue_cost_vec,
1397                                 unsigned int npeel,
1398                                 bool unknown_misalignment)
1399 {
1400   unsigned i;
1401   data_reference *dr;
1402
1403   FOR_EACH_VEC_ELT (datarefs, i, dr)
1404     {
1405       gimple *stmt = vect_dr_stmt (dr);
1406       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1407       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1408         continue;
1409
1410       /* For interleaving, only the alignment of the first access
1411          matters.  */
1412       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1413           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1414         continue;
1415
1416       /* Strided accesses perform only component accesses, alignment is
1417          irrelevant for them.  */
1418       if (STMT_VINFO_STRIDED_P (stmt_info)
1419           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1420         continue;
1421
1422       int save_misalignment;
1423       save_misalignment = DR_MISALIGNMENT (dr);
1424       if (npeel == 0)
1425         ;
1426       else if (unknown_misalignment && dr == dr0)
1427         SET_DR_MISALIGNMENT (dr, 0);
1428       else
1429         vect_update_misalignment_for_peel (dr, dr0, npeel);
1430       vect_get_data_access_cost (dr, inside_cost, outside_cost,
1431                                  body_cost_vec, prologue_cost_vec);
1432       SET_DR_MISALIGNMENT (dr, save_misalignment);
1433     }
1434 }
1435
1436 /* Traverse peeling hash table and calculate cost for each peeling option.
1437    Find the one with the lowest cost.  */
1438
1439 int
1440 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1441                                    _vect_peel_extended_info *min)
1442 {
1443   vect_peel_info elem = *slot;
1444   int dummy;
1445   unsigned int inside_cost = 0, outside_cost = 0;
1446   gimple *stmt = vect_dr_stmt (elem->dr);
1447   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1448   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1449   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1450                        epilogue_cost_vec;
1451
1452   prologue_cost_vec.create (2);
1453   body_cost_vec.create (2);
1454   epilogue_cost_vec.create (2);
1455
1456   vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1457                                   elem->dr, &inside_cost, &outside_cost,
1458                                   &body_cost_vec, &prologue_cost_vec,
1459                                   elem->npeel, false);
1460
1461   body_cost_vec.release ();
1462
1463   outside_cost += vect_get_known_peeling_cost
1464     (loop_vinfo, elem->npeel, &dummy,
1465      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1466      &prologue_cost_vec, &epilogue_cost_vec);
1467
1468   /* Prologue and epilogue costs are added to the target model later.
1469      These costs depend only on the scalar iteration cost, the
1470      number of peeling iterations finally chosen, and the number of
1471      misaligned statements.  So discard the information found here.  */
1472   prologue_cost_vec.release ();
1473   epilogue_cost_vec.release ();
1474
1475   if (inside_cost < min->inside_cost
1476       || (inside_cost == min->inside_cost
1477           && outside_cost < min->outside_cost))
1478     {
1479       min->inside_cost = inside_cost;
1480       min->outside_cost = outside_cost;
1481       min->peel_info.dr = elem->dr;
1482       min->peel_info.npeel = elem->npeel;
1483       min->peel_info.count = elem->count;
1484     }
1485
1486   return 1;
1487 }
1488
1489
1490 /* Choose best peeling option by traversing peeling hash table and either
1491    choosing an option with the lowest cost (if cost model is enabled) or the
1492    option that aligns as many accesses as possible.  */
1493
1494 static struct _vect_peel_extended_info
1495 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1496                                        loop_vec_info loop_vinfo)
1497 {
1498    struct _vect_peel_extended_info res;
1499
1500    res.peel_info.dr = NULL;
1501
1502    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1503      {
1504        res.inside_cost = INT_MAX;
1505        res.outside_cost = INT_MAX;
1506        peeling_htab->traverse <_vect_peel_extended_info *,
1507                                vect_peeling_hash_get_lowest_cost> (&res);
1508      }
1509    else
1510      {
1511        res.peel_info.count = 0;
1512        peeling_htab->traverse <_vect_peel_extended_info *,
1513                                vect_peeling_hash_get_most_frequent> (&res);
1514        res.inside_cost = 0;
1515        res.outside_cost = 0;
1516      }
1517
1518    return res;
1519 }
1520
1521 /* Return true if the new peeling NPEEL is supported.  */
1522
1523 static bool
1524 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1525                           unsigned npeel)
1526 {
1527   unsigned i;
1528   struct data_reference *dr = NULL;
1529   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1530   gimple *stmt;
1531   stmt_vec_info stmt_info;
1532   enum dr_alignment_support supportable_dr_alignment;
1533
1534   /* Ensure that all data refs can be vectorized after the peel.  */
1535   FOR_EACH_VEC_ELT (datarefs, i, dr)
1536     {
1537       int save_misalignment;
1538
1539       if (dr == dr0)
1540         continue;
1541
1542       stmt = vect_dr_stmt (dr);
1543       stmt_info = vinfo_for_stmt (stmt);
1544       /* For interleaving, only the alignment of the first access
1545          matters.  */
1546       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1547           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1548         continue;
1549
1550       /* Strided accesses perform only component accesses, alignment is
1551          irrelevant for them.  */
1552       if (STMT_VINFO_STRIDED_P (stmt_info)
1553           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1554         continue;
1555
1556       save_misalignment = DR_MISALIGNMENT (dr);
1557       vect_update_misalignment_for_peel (dr, dr0, npeel);
1558       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1559       SET_DR_MISALIGNMENT (dr, save_misalignment);
1560
1561       if (!supportable_dr_alignment)
1562         return false;
1563     }
1564
1565   return true;
1566 }
1567
1568 /* Function vect_enhance_data_refs_alignment
1569
1570    This pass will use loop versioning and loop peeling in order to enhance
1571    the alignment of data references in the loop.
1572
1573    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1574    original loop is to be vectorized.  Any other loops that are created by
1575    the transformations performed in this pass - are not supposed to be
1576    vectorized.  This restriction will be relaxed.
1577
1578    This pass will require a cost model to guide it whether to apply peeling
1579    or versioning or a combination of the two.  For example, the scheme that
1580    intel uses when given a loop with several memory accesses, is as follows:
1581    choose one memory access ('p') which alignment you want to force by doing
1582    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1583    other accesses are not necessarily aligned, or (2) use loop versioning to
1584    generate one loop in which all accesses are aligned, and another loop in
1585    which only 'p' is necessarily aligned.
1586
1587    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1588    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1589    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1590
1591    Devising a cost model is the most critical aspect of this work.  It will
1592    guide us on which access to peel for, whether to use loop versioning, how
1593    many versions to create, etc.  The cost model will probably consist of
1594    generic considerations as well as target specific considerations (on
1595    powerpc for example, misaligned stores are more painful than misaligned
1596    loads).
1597
1598    Here are the general steps involved in alignment enhancements:
1599
1600      -- original loop, before alignment analysis:
1601         for (i=0; i<N; i++){
1602           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1603           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1604         }
1605
1606      -- After vect_compute_data_refs_alignment:
1607         for (i=0; i<N; i++){
1608           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1609           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1610         }
1611
1612      -- Possibility 1: we do loop versioning:
1613      if (p is aligned) {
1614         for (i=0; i<N; i++){    # loop 1A
1615           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1616           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1617         }
1618      }
1619      else {
1620         for (i=0; i<N; i++){    # loop 1B
1621           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1622           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1623         }
1624      }
1625
1626      -- Possibility 2: we do loop peeling:
1627      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1628         x = q[i];
1629         p[i] = y;
1630      }
1631      for (i = 3; i < N; i++){   # loop 2A
1632         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1633         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1634      }
1635
1636      -- Possibility 3: combination of loop peeling and versioning:
1637      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1638         x = q[i];
1639         p[i] = y;
1640      }
1641      if (p is aligned) {
1642         for (i = 3; i<N; i++){  # loop 3A
1643           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1644           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1645         }
1646      }
1647      else {
1648         for (i = 3; i<N; i++){  # loop 3B
1649           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1650           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1651         }
1652      }
1653
1654      These loops are later passed to loop_transform to be vectorized.  The
1655      vectorizer will use the alignment information to guide the transformation
1656      (whether to generate regular loads/stores, or with special handling for
1657      misalignment).  */
1658
1659 bool
1660 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1661 {
1662   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1663   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1664   enum dr_alignment_support supportable_dr_alignment;
1665   struct data_reference *dr0 = NULL, *first_store = NULL;
1666   struct data_reference *dr;
1667   unsigned int i, j;
1668   bool do_peeling = false;
1669   bool do_versioning = false;
1670   bool stat;
1671   gimple *stmt;
1672   stmt_vec_info stmt_info;
1673   unsigned int npeel = 0;
1674   bool one_misalignment_known = false;
1675   bool one_misalignment_unknown = false;
1676   bool one_dr_unsupportable = false;
1677   struct data_reference *unsupportable_dr = NULL;
1678   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1679   unsigned possible_npeel_number = 1;
1680   tree vectype;
1681   unsigned int mis, same_align_drs_max = 0;
1682   hash_table<peel_info_hasher> peeling_htab (1);
1683
1684   DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1685
1686   /* Reset data so we can safely be called multiple times.  */
1687   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1688   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1689
1690   /* While cost model enhancements are expected in the future, the high level
1691      view of the code at this time is as follows:
1692
1693      A) If there is a misaligned access then see if peeling to align
1694         this access can make all data references satisfy
1695         vect_supportable_dr_alignment.  If so, update data structures
1696         as needed and return true.
1697
1698      B) If peeling wasn't possible and there is a data reference with an
1699         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1700         then see if loop versioning checks can be used to make all data
1701         references satisfy vect_supportable_dr_alignment.  If so, update
1702         data structures as needed and return true.
1703
1704      C) If neither peeling nor versioning were successful then return false if
1705         any data reference does not satisfy vect_supportable_dr_alignment.
1706
1707      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1708
1709      Note, Possibility 3 above (which is peeling and versioning together) is not
1710      being done at this time.  */
1711
1712   /* (1) Peeling to force alignment.  */
1713
1714   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1715      Considerations:
1716      + How many accesses will become aligned due to the peeling
1717      - How many accesses will become unaligned due to the peeling,
1718        and the cost of misaligned accesses.
1719      - The cost of peeling (the extra runtime checks, the increase
1720        in code size).  */
1721
1722   FOR_EACH_VEC_ELT (datarefs, i, dr)
1723     {
1724       stmt = vect_dr_stmt (dr);
1725       stmt_info = vinfo_for_stmt (stmt);
1726
1727       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1728         continue;
1729
1730       /* For interleaving, only the alignment of the first access
1731          matters.  */
1732       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1733           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1734         continue;
1735
1736       /* For invariant accesses there is nothing to enhance.  */
1737       if (integer_zerop (DR_STEP (dr)))
1738         continue;
1739
1740       /* Strided accesses perform only component accesses, alignment is
1741          irrelevant for them.  */
1742       if (STMT_VINFO_STRIDED_P (stmt_info)
1743           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1744         continue;
1745
1746       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1747       do_peeling = vector_alignment_reachable_p (dr);
1748       if (do_peeling)
1749         {
1750           if (known_alignment_for_access_p (dr))
1751             {
1752               unsigned int npeel_tmp = 0;
1753               bool negative = tree_int_cst_compare (DR_STEP (dr),
1754                                                     size_zero_node) < 0;
1755
1756               vectype = STMT_VINFO_VECTYPE (stmt_info);
1757               unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1758               unsigned int dr_size = vect_get_scalar_dr_size (dr);
1759               mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1760               if (DR_MISALIGNMENT (dr) != 0)
1761                 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1762
1763               /* For multiple types, it is possible that the bigger type access
1764                  will have more than one peeling option.  E.g., a loop with two
1765                  types: one of size (vector size / 4), and the other one of
1766                  size (vector size / 8).  Vectorization factor will 8.  If both
1767                  accesses are misaligned by 3, the first one needs one scalar
1768                  iteration to be aligned, and the second one needs 5.  But the
1769                  first one will be aligned also by peeling 5 scalar
1770                  iterations, and in that case both accesses will be aligned.
1771                  Hence, except for the immediate peeling amount, we also want
1772                  to try to add full vector size, while we don't exceed
1773                  vectorization factor.
1774                  We do this automatically for cost model, since we calculate
1775                  cost for every peeling option.  */
1776               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1777                 {
1778                   poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1779                                           ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1780                   possible_npeel_number
1781                     = vect_get_num_vectors (nscalars, vectype);
1782
1783                   /* NPEEL_TMP is 0 when there is no misalignment, but also
1784                      allow peeling NELEMENTS.  */
1785                   if (DR_MISALIGNMENT (dr) == 0)
1786                     possible_npeel_number++;
1787                 }
1788
1789               /* Save info about DR in the hash table.  Also include peeling
1790                  amounts according to the explanation above.  */
1791               for (j = 0; j < possible_npeel_number; j++)
1792                 {
1793                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1794                                             dr, npeel_tmp);
1795                   npeel_tmp += target_align / dr_size;
1796                 }
1797
1798               one_misalignment_known = true;
1799             }
1800           else
1801             {
1802               /* If we don't know any misalignment values, we prefer
1803                  peeling for data-ref that has the maximum number of data-refs
1804                  with the same alignment, unless the target prefers to align
1805                  stores over load.  */
1806               unsigned same_align_drs
1807                 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1808               if (!dr0
1809                   || same_align_drs_max < same_align_drs)
1810                 {
1811                   same_align_drs_max = same_align_drs;
1812                   dr0 = dr;
1813                 }
1814               /* For data-refs with the same number of related
1815                  accesses prefer the one where the misalign
1816                  computation will be invariant in the outermost loop.  */
1817               else if (same_align_drs_max == same_align_drs)
1818                 {
1819                   struct loop *ivloop0, *ivloop;
1820                   ivloop0 = outermost_invariant_loop_for_expr
1821                     (loop, DR_BASE_ADDRESS (dr0));
1822                   ivloop = outermost_invariant_loop_for_expr
1823                     (loop, DR_BASE_ADDRESS (dr));
1824                   if ((ivloop && !ivloop0)
1825                       || (ivloop && ivloop0
1826                           && flow_loop_nested_p (ivloop, ivloop0)))
1827                     dr0 = dr;
1828                 }
1829
1830               one_misalignment_unknown = true;
1831
1832               /* Check for data refs with unsupportable alignment that
1833                  can be peeled.  */
1834               if (!supportable_dr_alignment)
1835               {
1836                 one_dr_unsupportable = true;
1837                 unsupportable_dr = dr;
1838               }
1839
1840               if (!first_store && DR_IS_WRITE (dr))
1841                 first_store = dr;
1842             }
1843         }
1844       else
1845         {
1846           if (!aligned_access_p (dr))
1847             {
1848               if (dump_enabled_p ())
1849                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1850                                  "vector alignment may not be reachable\n");
1851               break;
1852             }
1853         }
1854     }
1855
1856   /* Check if we can possibly peel the loop.  */
1857   if (!vect_can_advance_ivs_p (loop_vinfo)
1858       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1859       || loop->inner)
1860     do_peeling = false;
1861
1862   struct _vect_peel_extended_info peel_for_known_alignment;
1863   struct _vect_peel_extended_info peel_for_unknown_alignment;
1864   struct _vect_peel_extended_info best_peel;
1865
1866   peel_for_unknown_alignment.inside_cost = INT_MAX;
1867   peel_for_unknown_alignment.outside_cost = INT_MAX;
1868   peel_for_unknown_alignment.peel_info.count = 0;
1869
1870   if (do_peeling
1871       && one_misalignment_unknown)
1872     {
1873       /* Check if the target requires to prefer stores over loads, i.e., if
1874          misaligned stores are more expensive than misaligned loads (taking
1875          drs with same alignment into account).  */
1876       unsigned int load_inside_cost = 0;
1877       unsigned int load_outside_cost = 0;
1878       unsigned int store_inside_cost = 0;
1879       unsigned int store_outside_cost = 0;
1880       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1881
1882       stmt_vector_for_cost dummy;
1883       dummy.create (2);
1884       vect_get_peeling_costs_all_drs (datarefs, dr0,
1885                                       &load_inside_cost,
1886                                       &load_outside_cost,
1887                                       &dummy, &dummy, estimated_npeels, true);
1888       dummy.release ();
1889
1890       if (first_store)
1891         {
1892           dummy.create (2);
1893           vect_get_peeling_costs_all_drs (datarefs, first_store,
1894                                           &store_inside_cost,
1895                                           &store_outside_cost,
1896                                           &dummy, &dummy,
1897                                           estimated_npeels, true);
1898           dummy.release ();
1899         }
1900       else
1901         {
1902           store_inside_cost = INT_MAX;
1903           store_outside_cost = INT_MAX;
1904         }
1905
1906       if (load_inside_cost > store_inside_cost
1907           || (load_inside_cost == store_inside_cost
1908               && load_outside_cost > store_outside_cost))
1909         {
1910           dr0 = first_store;
1911           peel_for_unknown_alignment.inside_cost = store_inside_cost;
1912           peel_for_unknown_alignment.outside_cost = store_outside_cost;
1913         }
1914       else
1915         {
1916           peel_for_unknown_alignment.inside_cost = load_inside_cost;
1917           peel_for_unknown_alignment.outside_cost = load_outside_cost;
1918         }
1919
1920       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1921       prologue_cost_vec.create (2);
1922       epilogue_cost_vec.create (2);
1923
1924       int dummy2;
1925       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1926         (loop_vinfo, estimated_npeels, &dummy2,
1927          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1928          &prologue_cost_vec, &epilogue_cost_vec);
1929
1930       prologue_cost_vec.release ();
1931       epilogue_cost_vec.release ();
1932
1933       peel_for_unknown_alignment.peel_info.count = 1
1934         + STMT_VINFO_SAME_ALIGN_REFS
1935         (vinfo_for_stmt (vect_dr_stmt (dr0))).length ();
1936     }
1937
1938   peel_for_unknown_alignment.peel_info.npeel = 0;
1939   peel_for_unknown_alignment.peel_info.dr = dr0;
1940
1941   best_peel = peel_for_unknown_alignment;
1942
1943   peel_for_known_alignment.inside_cost = INT_MAX;
1944   peel_for_known_alignment.outside_cost = INT_MAX;
1945   peel_for_known_alignment.peel_info.count = 0;
1946   peel_for_known_alignment.peel_info.dr = NULL;
1947
1948   if (do_peeling && one_misalignment_known)
1949     {
1950       /* Peeling is possible, but there is no data access that is not supported
1951          unless aligned.  So we try to choose the best possible peeling from
1952          the hash table.  */
1953       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1954         (&peeling_htab, loop_vinfo);
1955     }
1956
1957   /* Compare costs of peeling for known and unknown alignment. */
1958   if (peel_for_known_alignment.peel_info.dr != NULL
1959       && peel_for_unknown_alignment.inside_cost
1960       >= peel_for_known_alignment.inside_cost)
1961     {
1962       best_peel = peel_for_known_alignment;
1963
1964       /* If the best peeling for known alignment has NPEEL == 0, perform no
1965          peeling at all except if there is an unsupportable dr that we can
1966          align.  */
1967       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1968         do_peeling = false;
1969     }
1970
1971   /* If there is an unsupportable data ref, prefer this over all choices so far
1972      since we'd have to discard a chosen peeling except when it accidentally
1973      aligned the unsupportable data ref.  */
1974   if (one_dr_unsupportable)
1975     dr0 = unsupportable_dr;
1976   else if (do_peeling)
1977     {
1978       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1979          TODO: Use nopeel_outside_cost or get rid of it?  */
1980       unsigned nopeel_inside_cost = 0;
1981       unsigned nopeel_outside_cost = 0;
1982
1983       stmt_vector_for_cost dummy;
1984       dummy.create (2);
1985       vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1986                                       &nopeel_outside_cost, &dummy, &dummy,
1987                                       0, false);
1988       dummy.release ();
1989
1990       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
1991          costs will be recorded.  */
1992       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1993       prologue_cost_vec.create (2);
1994       epilogue_cost_vec.create (2);
1995
1996       int dummy2;
1997       nopeel_outside_cost += vect_get_known_peeling_cost
1998         (loop_vinfo, 0, &dummy2,
1999          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2000          &prologue_cost_vec, &epilogue_cost_vec);
2001
2002       prologue_cost_vec.release ();
2003       epilogue_cost_vec.release ();
2004
2005       npeel = best_peel.peel_info.npeel;
2006       dr0 = best_peel.peel_info.dr;
2007
2008       /* If no peeling is not more expensive than the best peeling we
2009          have so far, don't perform any peeling.  */
2010       if (nopeel_inside_cost <= best_peel.inside_cost)
2011         do_peeling = false;
2012     }
2013
2014   if (do_peeling)
2015     {
2016       stmt = vect_dr_stmt (dr0);
2017       stmt_info = vinfo_for_stmt (stmt);
2018       vectype = STMT_VINFO_VECTYPE (stmt_info);
2019
2020       if (known_alignment_for_access_p (dr0))
2021         {
2022           bool negative = tree_int_cst_compare (DR_STEP (dr0),
2023                                                 size_zero_node) < 0;
2024           if (!npeel)
2025             {
2026               /* Since it's known at compile time, compute the number of
2027                  iterations in the peeled loop (the peeling factor) for use in
2028                  updating DR_MISALIGNMENT values.  The peeling factor is the
2029                  vectorization factor minus the misalignment as an element
2030                  count.  */
2031               mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
2032               unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2033               npeel = ((mis & (target_align - 1))
2034                        / vect_get_scalar_dr_size (dr0));
2035             }
2036
2037           /* For interleaved data access every iteration accesses all the
2038              members of the group, therefore we divide the number of iterations
2039              by the group size.  */
2040           stmt_info = vinfo_for_stmt (vect_dr_stmt (dr0));
2041           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2042             npeel /= DR_GROUP_SIZE (stmt_info);
2043
2044           if (dump_enabled_p ())
2045             dump_printf_loc (MSG_NOTE, vect_location,
2046                              "Try peeling by %d\n", npeel);
2047         }
2048
2049       /* Ensure that all datarefs can be vectorized after the peel.  */
2050       if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
2051         do_peeling = false;
2052
2053       /* Check if all datarefs are supportable and log.  */
2054       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
2055         {
2056           stat = vect_verify_datarefs_alignment (loop_vinfo);
2057           if (!stat)
2058             do_peeling = false;
2059           else
2060             return stat;
2061         }
2062
2063       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2064       if (do_peeling)
2065         {
2066           unsigned max_allowed_peel
2067             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2068           if (max_allowed_peel != (unsigned)-1)
2069             {
2070               unsigned max_peel = npeel;
2071               if (max_peel == 0)
2072                 {
2073                   unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2074                   max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2075                 }
2076               if (max_peel > max_allowed_peel)
2077                 {
2078                   do_peeling = false;
2079                   if (dump_enabled_p ())
2080                     dump_printf_loc (MSG_NOTE, vect_location,
2081                         "Disable peeling, max peels reached: %d\n", max_peel);
2082                 }
2083             }
2084         }
2085
2086       /* Cost model #2 - if peeling may result in a remaining loop not
2087          iterating enough to be vectorized then do not peel.  Since this
2088          is a cost heuristic rather than a correctness decision, use the
2089          most likely runtime value for variable vectorization factors.  */
2090       if (do_peeling
2091           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2092         {
2093           unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2094           unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2095           if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2096               < assumed_vf + max_peel)
2097             do_peeling = false;
2098         }
2099
2100       if (do_peeling)
2101         {
2102           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2103              If the misalignment of DR_i is identical to that of dr0 then set
2104              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2105              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2106              by the peeling factor times the element size of DR_i (MOD the
2107              vectorization factor times the size).  Otherwise, the
2108              misalignment of DR_i must be set to unknown.  */
2109           FOR_EACH_VEC_ELT (datarefs, i, dr)
2110             if (dr != dr0)
2111               {
2112                 /* Strided accesses perform only component accesses, alignment
2113                    is irrelevant for them.  */
2114                 stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
2115                 if (STMT_VINFO_STRIDED_P (stmt_info)
2116                     && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2117                   continue;
2118
2119                 vect_update_misalignment_for_peel (dr, dr0, npeel);
2120               }
2121
2122           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2123           if (npeel)
2124             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2125           else
2126             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2127               = DR_MISALIGNMENT (dr0);
2128           SET_DR_MISALIGNMENT (dr0, 0);
2129           if (dump_enabled_p ())
2130             {
2131               dump_printf_loc (MSG_NOTE, vect_location,
2132                                "Alignment of access forced using peeling.\n");
2133               dump_printf_loc (MSG_NOTE, vect_location,
2134                                "Peeling for alignment will be applied.\n");
2135             }
2136
2137           /* The inside-loop cost will be accounted for in vectorizable_load
2138              and vectorizable_store correctly with adjusted alignments.
2139              Drop the body_cst_vec on the floor here.  */
2140           stat = vect_verify_datarefs_alignment (loop_vinfo);
2141           gcc_assert (stat);
2142           return stat;
2143         }
2144     }
2145
2146   /* (2) Versioning to force alignment.  */
2147
2148   /* Try versioning if:
2149      1) optimize loop for speed
2150      2) there is at least one unsupported misaligned data ref with an unknown
2151         misalignment, and
2152      3) all misaligned data refs with a known misalignment are supported, and
2153      4) the number of runtime alignment checks is within reason.  */
2154
2155   do_versioning =
2156         optimize_loop_nest_for_speed_p (loop)
2157         && (!loop->inner); /* FORNOW */
2158
2159   if (do_versioning)
2160     {
2161       FOR_EACH_VEC_ELT (datarefs, i, dr)
2162         {
2163           stmt = vect_dr_stmt (dr);
2164           stmt_info = vinfo_for_stmt (stmt);
2165
2166           /* For interleaving, only the alignment of the first access
2167              matters.  */
2168           if (aligned_access_p (dr)
2169               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2170                   && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2171             continue;
2172
2173           if (STMT_VINFO_STRIDED_P (stmt_info))
2174             {
2175               /* Strided loads perform only component accesses, alignment is
2176                  irrelevant for them.  */
2177               if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2178                 continue;
2179               do_versioning = false;
2180               break;
2181             }
2182
2183           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2184
2185           if (!supportable_dr_alignment)
2186             {
2187               gimple *stmt;
2188               int mask;
2189               tree vectype;
2190
2191               if (known_alignment_for_access_p (dr)
2192                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2193                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2194                 {
2195                   do_versioning = false;
2196                   break;
2197                 }
2198
2199               stmt = vect_dr_stmt (dr);
2200               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2201               gcc_assert (vectype);
2202
2203               /* At present we don't support versioning for alignment
2204                  with variable VF, since there's no guarantee that the
2205                  VF is a power of two.  We could relax this if we added
2206                  a way of enforcing a power-of-two size.  */
2207               unsigned HOST_WIDE_INT size;
2208               if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2209                 {
2210                   do_versioning = false;
2211                   break;
2212                 }
2213
2214               /* The rightmost bits of an aligned address must be zeros.
2215                  Construct the mask needed for this test.  For example,
2216                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2217                  mask must be 15 = 0xf. */
2218               mask = size - 1;
2219
2220               /* FORNOW: use the same mask to test all potentially unaligned
2221                  references in the loop.  The vectorizer currently supports
2222                  a single vector size, see the reference to
2223                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2224                  vectorization factor is computed.  */
2225               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2226                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2227               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2228               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2229                       vect_dr_stmt (dr));
2230             }
2231         }
2232
2233       /* Versioning requires at least one misaligned data reference.  */
2234       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2235         do_versioning = false;
2236       else if (!do_versioning)
2237         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2238     }
2239
2240   if (do_versioning)
2241     {
2242       vec<gimple *> may_misalign_stmts
2243         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2244       gimple *stmt;
2245
2246       /* It can now be assumed that the data references in the statements
2247          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2248          of the loop being vectorized.  */
2249       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2250         {
2251           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2252           dr = STMT_VINFO_DATA_REF (stmt_info);
2253           SET_DR_MISALIGNMENT (dr, 0);
2254           if (dump_enabled_p ())
2255             dump_printf_loc (MSG_NOTE, vect_location,
2256                              "Alignment of access forced using versioning.\n");
2257         }
2258
2259       if (dump_enabled_p ())
2260         dump_printf_loc (MSG_NOTE, vect_location,
2261                          "Versioning for alignment will be applied.\n");
2262
2263       /* Peeling and versioning can't be done together at this time.  */
2264       gcc_assert (! (do_peeling && do_versioning));
2265
2266       stat = vect_verify_datarefs_alignment (loop_vinfo);
2267       gcc_assert (stat);
2268       return stat;
2269     }
2270
2271   /* This point is reached if neither peeling nor versioning is being done.  */
2272   gcc_assert (! (do_peeling || do_versioning));
2273
2274   stat = vect_verify_datarefs_alignment (loop_vinfo);
2275   return stat;
2276 }
2277
2278
2279 /* Function vect_find_same_alignment_drs.
2280
2281    Update group and alignment relations according to the chosen
2282    vectorization factor.  */
2283
2284 static void
2285 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2286 {
2287   struct data_reference *dra = DDR_A (ddr);
2288   struct data_reference *drb = DDR_B (ddr);
2289   stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
2290   stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
2291
2292   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2293     return;
2294
2295   if (dra == drb)
2296     return;
2297
2298   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2299       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2300       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2301     return;
2302
2303   /* Two references with distance zero have the same alignment.  */
2304   poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2305                           - wi::to_poly_offset (DR_INIT (drb)));
2306   if (maybe_ne (diff, 0))
2307     {
2308       /* Get the wider of the two alignments.  */
2309       unsigned int align_a = (vect_calculate_target_alignment (dra)
2310                               / BITS_PER_UNIT);
2311       unsigned int align_b = (vect_calculate_target_alignment (drb)
2312                               / BITS_PER_UNIT);
2313       unsigned int max_align = MAX (align_a, align_b);
2314
2315       /* Require the gap to be a multiple of the larger vector alignment.  */
2316       if (!multiple_p (diff, max_align))
2317         return;
2318     }
2319
2320   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2321   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2322   if (dump_enabled_p ())
2323     {
2324       dump_printf_loc (MSG_NOTE, vect_location,
2325                        "accesses have the same alignment: ");
2326       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2327       dump_printf (MSG_NOTE,  " and ");
2328       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2329       dump_printf (MSG_NOTE, "\n");
2330     }
2331 }
2332
2333
2334 /* Function vect_analyze_data_refs_alignment
2335
2336    Analyze the alignment of the data-references in the loop.
2337    Return FALSE if a data reference is found that cannot be vectorized.  */
2338
2339 bool
2340 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2341 {
2342   DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2343
2344   /* Mark groups of data references with same alignment using
2345      data dependence information.  */
2346   vec<ddr_p> ddrs = vinfo->ddrs;
2347   struct data_dependence_relation *ddr;
2348   unsigned int i;
2349
2350   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2351     vect_find_same_alignment_drs (ddr);
2352
2353   vec<data_reference_p> datarefs = vinfo->datarefs;
2354   struct data_reference *dr;
2355
2356   vect_record_base_alignments (vinfo);
2357   FOR_EACH_VEC_ELT (datarefs, i, dr)
2358     {
2359       stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
2360       if (STMT_VINFO_VECTORIZABLE (stmt_info)
2361           && !vect_compute_data_ref_alignment (dr))
2362         {
2363           /* Strided accesses perform only component accesses, misalignment
2364              information is irrelevant for them.  */
2365           if (STMT_VINFO_STRIDED_P (stmt_info)
2366               && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2367             continue;
2368
2369           if (dump_enabled_p ())
2370             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2371                              "not vectorized: can't calculate alignment "
2372                              "for data ref.\n");
2373
2374           return false;
2375         }
2376     }
2377
2378   return true;
2379 }
2380
2381
2382 /* Analyze alignment of DRs of stmts in NODE.  */
2383
2384 static bool
2385 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2386 {
2387   /* We vectorize from the first scalar stmt in the node unless
2388      the node is permuted in which case we start from the first
2389      element in the group.  */
2390   gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2391   data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2392   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2393     first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2394
2395   data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2396   if (! vect_compute_data_ref_alignment (dr)
2397       /* For creating the data-ref pointer we need alignment of the
2398          first element anyway.  */
2399       || (dr != first_dr
2400           && ! vect_compute_data_ref_alignment (first_dr))
2401       || ! verify_data_ref_alignment (dr))
2402     {
2403       if (dump_enabled_p ())
2404         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2405                          "not vectorized: bad data alignment in basic "
2406                          "block.\n");
2407       return false;
2408     }
2409
2410   return true;
2411 }
2412
2413 /* Function vect_slp_analyze_instance_alignment
2414
2415    Analyze the alignment of the data-references in the SLP instance.
2416    Return FALSE if a data reference is found that cannot be vectorized.  */
2417
2418 bool
2419 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2420 {
2421   DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2422
2423   slp_tree node;
2424   unsigned i;
2425   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2426     if (! vect_slp_analyze_and_verify_node_alignment (node))
2427       return false;
2428
2429   node = SLP_INSTANCE_TREE (instance);
2430   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2431       && ! vect_slp_analyze_and_verify_node_alignment
2432              (SLP_INSTANCE_TREE (instance)))
2433     return false;
2434
2435   return true;
2436 }
2437
2438
2439 /* Analyze groups of accesses: check that DR belongs to a group of
2440    accesses of legal size, step, etc.  Detect gaps, single element
2441    interleaving, and other special cases. Set grouped access info.
2442    Collect groups of strided stores for further use in SLP analysis.
2443    Worker for vect_analyze_group_access.  */
2444
2445 static bool
2446 vect_analyze_group_access_1 (struct data_reference *dr)
2447 {
2448   tree step = DR_STEP (dr);
2449   tree scalar_type = TREE_TYPE (DR_REF (dr));
2450   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2451   gimple *stmt = vect_dr_stmt (dr);
2452   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2453   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2454   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2455   HOST_WIDE_INT dr_step = -1;
2456   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2457   bool slp_impossible = false;
2458
2459   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2460      size of the interleaving group (including gaps).  */
2461   if (tree_fits_shwi_p (step))
2462     {
2463       dr_step = tree_to_shwi (step);
2464       /* Check that STEP is a multiple of type size.  Otherwise there is
2465          a non-element-sized gap at the end of the group which we
2466          cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2467          ???  As we can handle non-constant step fine here we should
2468          simply remove uses of DR_GROUP_GAP between the last and first
2469          element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
2470          simply not include that gap.  */
2471       if ((dr_step % type_size) != 0)
2472         {
2473           if (dump_enabled_p ())
2474             {
2475               dump_printf_loc (MSG_NOTE, vect_location,
2476                                "Step ");
2477               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2478               dump_printf (MSG_NOTE,
2479                            " is not a multiple of the element size for ");
2480               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2481               dump_printf (MSG_NOTE, "\n");
2482             }
2483           return false;
2484         }
2485       groupsize = absu_hwi (dr_step) / type_size;
2486     }
2487   else
2488     groupsize = 0;
2489
2490   /* Not consecutive access is possible only if it is a part of interleaving.  */
2491   if (!DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2492     {
2493       /* Check if it this DR is a part of interleaving, and is a single
2494          element of the group that is accessed in the loop.  */
2495
2496       /* Gaps are supported only for loads. STEP must be a multiple of the type
2497          size.  */
2498       if (DR_IS_READ (dr)
2499           && (dr_step % type_size) == 0
2500           && groupsize > 0)
2501         {
2502           DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2503           DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2504           DR_GROUP_GAP (stmt_info) = groupsize - 1;
2505           if (dump_enabled_p ())
2506             {
2507               dump_printf_loc (MSG_NOTE, vect_location,
2508                                "Detected single element interleaving ");
2509               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2510               dump_printf (MSG_NOTE, " step ");
2511               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2512               dump_printf (MSG_NOTE, "\n");
2513             }
2514
2515           return true;
2516         }
2517
2518       if (dump_enabled_p ())
2519         {
2520           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2521                            "not consecutive access ");
2522           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2523         }
2524
2525       if (bb_vinfo)
2526         {
2527           /* Mark the statement as unvectorizable.  */
2528           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr))) = false;
2529           return true;
2530         }
2531
2532       dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2533       STMT_VINFO_STRIDED_P (stmt_info) = true;
2534       return true;
2535     }
2536
2537   if (DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2538     {
2539       /* First stmt in the interleaving chain. Check the chain.  */
2540       gimple *next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2541       struct data_reference *data_ref = dr;
2542       unsigned int count = 1;
2543       tree prev_init = DR_INIT (data_ref);
2544       gimple *prev = stmt;
2545       HOST_WIDE_INT diff, gaps = 0;
2546
2547       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2548       while (next)
2549         {
2550           /* Skip same data-refs.  In case that two or more stmts share
2551              data-ref (supported only for loads), we vectorize only the first
2552              stmt, and the rest get their vectorized loads from the first
2553              one.  */
2554           if (!tree_int_cst_compare (DR_INIT (data_ref),
2555                                      DR_INIT (STMT_VINFO_DATA_REF (
2556                                                    vinfo_for_stmt (next)))))
2557             {
2558               if (DR_IS_WRITE (data_ref))
2559                 {
2560                   if (dump_enabled_p ())
2561                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2562                                      "Two store stmts share the same dr.\n");
2563                   return false;
2564                 }
2565
2566               if (dump_enabled_p ())
2567                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2568                                  "Two or more load stmts share the same dr.\n");
2569
2570               /* For load use the same data-ref load.  */
2571               DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2572
2573               prev = next;
2574               next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2575               continue;
2576             }
2577
2578           prev = next;
2579           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2580
2581           /* All group members have the same STEP by construction.  */
2582           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2583
2584           /* Check that the distance between two accesses is equal to the type
2585              size. Otherwise, we have gaps.  */
2586           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2587                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2588           if (diff != 1)
2589             {
2590               /* FORNOW: SLP of accesses with gaps is not supported.  */
2591               slp_impossible = true;
2592               if (DR_IS_WRITE (data_ref))
2593                 {
2594                   if (dump_enabled_p ())
2595                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2596                                      "interleaved store with gaps\n");
2597                   return false;
2598                 }
2599
2600               gaps += diff - 1;
2601             }
2602
2603           last_accessed_element += diff;
2604
2605           /* Store the gap from the previous member of the group. If there is no
2606              gap in the access, DR_GROUP_GAP is always 1.  */
2607           DR_GROUP_GAP (vinfo_for_stmt (next)) = diff;
2608
2609           prev_init = DR_INIT (data_ref);
2610           next = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2611           /* Count the number of data-refs in the chain.  */
2612           count++;
2613         }
2614
2615       if (groupsize == 0)
2616         groupsize = count + gaps;
2617
2618       /* This could be UINT_MAX but as we are generating code in a very
2619          inefficient way we have to cap earlier.  See PR78699 for example.  */
2620       if (groupsize > 4096)
2621         {
2622           if (dump_enabled_p ())
2623             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2624                              "group is too large\n");
2625           return false;
2626         }
2627
2628       /* Check that the size of the interleaving is equal to count for stores,
2629          i.e., that there are no gaps.  */
2630       if (groupsize != count
2631           && !DR_IS_READ (dr))
2632         {
2633           if (dump_enabled_p ())
2634             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2635                              "interleaved store with gaps\n");
2636           return false;
2637         }
2638
2639       /* If there is a gap after the last load in the group it is the
2640          difference between the groupsize and the last accessed
2641          element.
2642          When there is no gap, this difference should be 0.  */
2643       DR_GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2644
2645       DR_GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2646       if (dump_enabled_p ())
2647         {
2648           dump_printf_loc (MSG_NOTE, vect_location,
2649                            "Detected interleaving ");
2650           if (DR_IS_READ (dr))
2651             dump_printf (MSG_NOTE, "load ");
2652           else
2653             dump_printf (MSG_NOTE, "store ");
2654           dump_printf (MSG_NOTE, "of size %u starting with ",
2655                        (unsigned)groupsize);
2656           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2657           if (DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2658             dump_printf_loc (MSG_NOTE, vect_location,
2659                              "There is a gap of %u elements after the group\n",
2660                              DR_GROUP_GAP (vinfo_for_stmt (stmt)));
2661         }
2662
2663       /* SLP: create an SLP data structure for every interleaving group of
2664          stores for further analysis in vect_analyse_slp.  */
2665       if (DR_IS_WRITE (dr) && !slp_impossible)
2666         {
2667           if (loop_vinfo)
2668             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2669           if (bb_vinfo)
2670             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2671         }
2672     }
2673
2674   return true;
2675 }
2676
2677 /* Analyze groups of accesses: check that DR belongs to a group of
2678    accesses of legal size, step, etc.  Detect gaps, single element
2679    interleaving, and other special cases. Set grouped access info.
2680    Collect groups of strided stores for further use in SLP analysis.  */
2681
2682 static bool
2683 vect_analyze_group_access (struct data_reference *dr)
2684 {
2685   if (!vect_analyze_group_access_1 (dr))
2686     {
2687       /* Dissolve the group if present.  */
2688       gimple *next;
2689       gimple *stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (vect_dr_stmt (dr)));
2690       while (stmt)
2691         {
2692           stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2693           next = DR_GROUP_NEXT_ELEMENT (vinfo);
2694           DR_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2695           DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2696           stmt = next;
2697         }
2698       return false;
2699     }
2700   return true;
2701 }
2702
2703 /* Analyze the access pattern of the data-reference DR.
2704    In case of non-consecutive accesses call vect_analyze_group_access() to
2705    analyze groups of accesses.  */
2706
2707 static bool
2708 vect_analyze_data_ref_access (struct data_reference *dr)
2709 {
2710   tree step = DR_STEP (dr);
2711   tree scalar_type = TREE_TYPE (DR_REF (dr));
2712   gimple *stmt = vect_dr_stmt (dr);
2713   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2714   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2715   struct loop *loop = NULL;
2716
2717   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2718     return true;
2719
2720   if (loop_vinfo)
2721     loop = LOOP_VINFO_LOOP (loop_vinfo);
2722
2723   if (loop_vinfo && !step)
2724     {
2725       if (dump_enabled_p ())
2726         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2727                          "bad data-ref access in loop\n");
2728       return false;
2729     }
2730
2731   /* Allow loads with zero step in inner-loop vectorization.  */
2732   if (loop_vinfo && integer_zerop (step))
2733     {
2734       DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2735       if (!nested_in_vect_loop_p (loop, stmt))
2736         return DR_IS_READ (dr);
2737       /* Allow references with zero step for outer loops marked
2738          with pragma omp simd only - it guarantees absence of
2739          loop-carried dependencies between inner loop iterations.  */
2740       if (loop->safelen < 2)
2741         {
2742           if (dump_enabled_p ())
2743             dump_printf_loc (MSG_NOTE, vect_location,
2744                              "zero step in inner loop of nest\n");
2745           return false;
2746         }
2747     }
2748
2749   if (loop && nested_in_vect_loop_p (loop, stmt))
2750     {
2751       /* Interleaved accesses are not yet supported within outer-loop
2752         vectorization for references in the inner-loop.  */
2753       DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2754
2755       /* For the rest of the analysis we use the outer-loop step.  */
2756       step = STMT_VINFO_DR_STEP (stmt_info);
2757       if (integer_zerop (step))
2758         {
2759           if (dump_enabled_p ())
2760             dump_printf_loc (MSG_NOTE, vect_location,
2761                              "zero step in outer loop.\n");
2762           return DR_IS_READ (dr);
2763         }
2764     }
2765
2766   /* Consecutive?  */
2767   if (TREE_CODE (step) == INTEGER_CST)
2768     {
2769       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2770       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2771           || (dr_step < 0
2772               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2773         {
2774           /* Mark that it is not interleaving.  */
2775           DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2776           return true;
2777         }
2778     }
2779
2780   if (loop && nested_in_vect_loop_p (loop, stmt))
2781     {
2782       if (dump_enabled_p ())
2783         dump_printf_loc (MSG_NOTE, vect_location,
2784                          "grouped access in outer loop.\n");
2785       return false;
2786     }
2787
2788
2789   /* Assume this is a DR handled by non-constant strided load case.  */
2790   if (TREE_CODE (step) != INTEGER_CST)
2791     return (STMT_VINFO_STRIDED_P (stmt_info)
2792             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2793                 || vect_analyze_group_access (dr)));
2794
2795   /* Not consecutive access - check if it's a part of interleaving group.  */
2796   return vect_analyze_group_access (dr);
2797 }
2798
2799 /* Compare two data-references DRA and DRB to group them into chunks
2800    suitable for grouping.  */
2801
2802 static int
2803 dr_group_sort_cmp (const void *dra_, const void *drb_)
2804 {
2805   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2806   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2807   int cmp;
2808
2809   /* Stabilize sort.  */
2810   if (dra == drb)
2811     return 0;
2812
2813   /* DRs in different loops never belong to the same group.  */
2814   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2815   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2816   if (loopa != loopb)
2817     return loopa->num < loopb->num ? -1 : 1;
2818
2819   /* Ordering of DRs according to base.  */
2820   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2821                                DR_BASE_ADDRESS (drb));
2822   if (cmp != 0)
2823     return cmp;
2824
2825   /* And according to DR_OFFSET.  */
2826   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2827   if (cmp != 0)
2828     return cmp;
2829
2830   /* Put reads before writes.  */
2831   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2832     return DR_IS_READ (dra) ? -1 : 1;
2833
2834   /* Then sort after access size.  */
2835   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2836                                TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2837   if (cmp != 0)
2838     return cmp;
2839
2840   /* And after step.  */
2841   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2842   if (cmp != 0)
2843     return cmp;
2844
2845   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2846   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2847   if (cmp == 0)
2848     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2849   return cmp;
2850 }
2851
2852 /* If OP is the result of a conversion, return the unconverted value,
2853    otherwise return null.  */
2854
2855 static tree
2856 strip_conversion (tree op)
2857 {
2858   if (TREE_CODE (op) != SSA_NAME)
2859     return NULL_TREE;
2860   gimple *stmt = SSA_NAME_DEF_STMT (op);
2861   if (!is_gimple_assign (stmt)
2862       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2863     return NULL_TREE;
2864   return gimple_assign_rhs1 (stmt);
2865 }
2866
2867 /* Return true if vectorizable_* routines can handle statements STMT1
2868    and STMT2 being in a single group.  */
2869
2870 static bool
2871 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2872 {
2873   if (gimple_assign_single_p (stmt1))
2874     return gimple_assign_single_p (stmt2);
2875
2876   if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2877     {
2878       /* Check for two masked loads or two masked stores.  */
2879       if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2880         return false;
2881       internal_fn ifn = gimple_call_internal_fn (stmt1);
2882       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2883         return false;
2884       if (ifn != gimple_call_internal_fn (stmt2))
2885         return false;
2886
2887       /* Check that the masks are the same.  Cope with casts of masks,
2888          like those created by build_mask_conversion.  */
2889       tree mask1 = gimple_call_arg (stmt1, 2);
2890       tree mask2 = gimple_call_arg (stmt2, 2);
2891       if (!operand_equal_p (mask1, mask2, 0))
2892         {
2893           mask1 = strip_conversion (mask1);
2894           if (!mask1)
2895             return false;
2896           mask2 = strip_conversion (mask2);
2897           if (!mask2)
2898             return false;
2899           if (!operand_equal_p (mask1, mask2, 0))
2900             return false;
2901         }
2902       return true;
2903     }
2904
2905   return false;
2906 }
2907
2908 /* Function vect_analyze_data_ref_accesses.
2909
2910    Analyze the access pattern of all the data references in the loop.
2911
2912    FORNOW: the only access pattern that is considered vectorizable is a
2913            simple step 1 (consecutive) access.
2914
2915    FORNOW: handle only arrays and pointer accesses.  */
2916
2917 bool
2918 vect_analyze_data_ref_accesses (vec_info *vinfo)
2919 {
2920   unsigned int i;
2921   vec<data_reference_p> datarefs = vinfo->datarefs;
2922   struct data_reference *dr;
2923
2924   DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2925
2926   if (datarefs.is_empty ())
2927     return true;
2928
2929   /* Sort the array of datarefs to make building the interleaving chains
2930      linear.  Don't modify the original vector's order, it is needed for
2931      determining what dependencies are reversed.  */
2932   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2933   datarefs_copy.qsort (dr_group_sort_cmp);
2934
2935   /* Build the interleaving chains.  */
2936   for (i = 0; i < datarefs_copy.length () - 1;)
2937     {
2938       data_reference_p dra = datarefs_copy[i];
2939       stmt_vec_info stmtinfo_a = vinfo_for_stmt (vect_dr_stmt (dra));
2940       stmt_vec_info lastinfo = NULL;
2941       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2942           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2943         {
2944           ++i;
2945           continue;
2946         }
2947       for (i = i + 1; i < datarefs_copy.length (); ++i)
2948         {
2949           data_reference_p drb = datarefs_copy[i];
2950           stmt_vec_info stmtinfo_b = vinfo_for_stmt (vect_dr_stmt (drb));
2951           if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2952               || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2953             break;
2954
2955           /* ???  Imperfect sorting (non-compatible types, non-modulo
2956              accesses, same accesses) can lead to a group to be artificially
2957              split here as we don't just skip over those.  If it really
2958              matters we can push those to a worklist and re-iterate
2959              over them.  The we can just skip ahead to the next DR here.  */
2960
2961           /* DRs in a different loop should not be put into the same
2962              interleaving group.  */
2963           if (gimple_bb (DR_STMT (dra))->loop_father
2964               != gimple_bb (DR_STMT (drb))->loop_father)
2965             break;
2966
2967           /* Check that the data-refs have same first location (except init)
2968              and they are both either store or load (not load and store,
2969              not masked loads or stores).  */
2970           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2971               || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2972                                         DR_BASE_ADDRESS (drb)) != 0
2973               || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2974               || !can_group_stmts_p (vect_dr_stmt (dra), vect_dr_stmt (drb)))
2975             break;
2976
2977           /* Check that the data-refs have the same constant size.  */
2978           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2979           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2980           if (!tree_fits_uhwi_p (sza)
2981               || !tree_fits_uhwi_p (szb)
2982               || !tree_int_cst_equal (sza, szb))
2983             break;
2984
2985           /* Check that the data-refs have the same step.  */
2986           if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2987             break;
2988
2989           /* Check the types are compatible.
2990              ???  We don't distinguish this during sorting.  */
2991           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2992                                    TREE_TYPE (DR_REF (drb))))
2993             break;
2994
2995           /* Check that the DR_INITs are compile-time constants.  */
2996           if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2997               || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2998             break;
2999
3000           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
3001           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3002           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3003           HOST_WIDE_INT init_prev
3004             = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3005           gcc_assert (init_a <= init_b
3006                       && init_a <= init_prev
3007                       && init_prev <= init_b);
3008
3009           /* Do not place the same access in the interleaving chain twice.  */
3010           if (init_b == init_prev)
3011             {
3012               gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3013                           < gimple_uid (DR_STMT (drb)));
3014               /* ???  For now we simply "drop" the later reference which is
3015                  otherwise the same rather than finishing off this group.
3016                  In the end we'd want to re-process duplicates forming
3017                  multiple groups from the refs, likely by just collecting
3018                  all candidates (including duplicates and split points
3019                  below) in a vector and then process them together.  */
3020               continue;
3021             }
3022
3023           /* If init_b == init_a + the size of the type * k, we have an
3024              interleaving, and DRA is accessed before DRB.  */
3025           HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3026           if (type_size_a == 0
3027               || (init_b - init_a) % type_size_a != 0)
3028             break;
3029
3030           /* If we have a store, the accesses are adjacent.  This splits
3031              groups into chunks we support (we don't support vectorization
3032              of stores with gaps).  */
3033           if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3034             break;
3035
3036           /* If the step (if not zero or non-constant) is greater than the
3037              difference between data-refs' inits this splits groups into
3038              suitable sizes.  */
3039           if (tree_fits_shwi_p (DR_STEP (dra)))
3040             {
3041               HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3042               if (step != 0 && step <= (init_b - init_a))
3043                 break;
3044             }
3045
3046           if (dump_enabled_p ())
3047             {
3048               dump_printf_loc (MSG_NOTE, vect_location,
3049                                "Detected interleaving ");
3050               if (DR_IS_READ (dra))
3051                 dump_printf (MSG_NOTE, "load ");
3052               else
3053                 dump_printf (MSG_NOTE, "store ");
3054               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3055               dump_printf (MSG_NOTE,  " and ");
3056               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3057               dump_printf (MSG_NOTE, "\n");
3058             }
3059
3060           /* Link the found element into the group list.  */
3061           if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3062             {
3063               DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = vect_dr_stmt (dra);
3064               lastinfo = stmtinfo_a;
3065             }
3066           DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = vect_dr_stmt (dra);
3067           DR_GROUP_NEXT_ELEMENT (lastinfo) = vect_dr_stmt (drb);
3068           lastinfo = stmtinfo_b;
3069         }
3070     }
3071
3072   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3073     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr)))
3074         && !vect_analyze_data_ref_access (dr))
3075       {
3076         if (dump_enabled_p ())
3077           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3078                            "not vectorized: complicated access pattern.\n");
3079
3080         if (is_a <bb_vec_info> (vinfo))
3081           {
3082             /* Mark the statement as not vectorizable.  */
3083             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (vect_dr_stmt (dr))) = false;
3084             continue;
3085           }
3086         else
3087           {
3088             datarefs_copy.release ();
3089             return false;
3090           }
3091       }
3092
3093   datarefs_copy.release ();
3094   return true;
3095 }
3096
3097 /* Function vect_vfa_segment_size.
3098
3099    Input:
3100      DR: The data reference.
3101      LENGTH_FACTOR: segment length to consider.
3102
3103    Return a value suitable for the dr_with_seg_len::seg_len field.
3104    This is the "distance travelled" by the pointer from the first
3105    iteration in the segment to the last.  Note that it does not include
3106    the size of the access; in effect it only describes the first byte.  */
3107
3108 static tree
3109 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3110 {
3111   length_factor = size_binop (MINUS_EXPR,
3112                               fold_convert (sizetype, length_factor),
3113                               size_one_node);
3114   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr)),
3115                      length_factor);
3116 }
3117
3118 /* Return a value that, when added to abs (vect_vfa_segment_size (dr)),
3119    gives the worst-case number of bytes covered by the segment.  */
3120
3121 static unsigned HOST_WIDE_INT
3122 vect_vfa_access_size (data_reference *dr)
3123 {
3124   stmt_vec_info stmt_vinfo = vinfo_for_stmt (vect_dr_stmt (dr));
3125   tree ref_type = TREE_TYPE (DR_REF (dr));
3126   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3127   unsigned HOST_WIDE_INT access_size = ref_size;
3128   if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3129     {
3130       gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == vect_dr_stmt (dr));
3131       access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3132     }
3133   if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3134       && (vect_supportable_dr_alignment (dr, false)
3135           == dr_explicit_realign_optimized))
3136     {
3137       /* We might access a full vector's worth.  */
3138       tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3139       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3140     }
3141   return access_size;
3142 }
3143
3144 /* Get the minimum alignment for all the scalar accesses that DR describes.  */
3145
3146 static unsigned int
3147 vect_vfa_align (const data_reference *dr)
3148 {
3149   return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr)));
3150 }
3151
3152 /* Function vect_no_alias_p.
3153
3154    Given data references A and B with equal base and offset, see whether
3155    the alias relation can be decided at compilation time.  Return 1 if
3156    it can and the references alias, 0 if it can and the references do
3157    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3158    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3159    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
3160
3161 static int
3162 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3163                          tree segment_length_a, tree segment_length_b,
3164                          unsigned HOST_WIDE_INT access_size_a,
3165                          unsigned HOST_WIDE_INT access_size_b)
3166 {
3167   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3168   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3169   poly_uint64 const_length_a;
3170   poly_uint64 const_length_b;
3171
3172   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3173      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3174      [a, a+12) */
3175   if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3176     {
3177       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3178       offset_a = (offset_a + access_size_a) - const_length_a;
3179     }
3180   else
3181     const_length_a = tree_to_poly_uint64 (segment_length_a);
3182   if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3183     {
3184       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3185       offset_b = (offset_b + access_size_b) - const_length_b;
3186     }
3187   else
3188     const_length_b = tree_to_poly_uint64 (segment_length_b);
3189
3190   const_length_a += access_size_a;
3191   const_length_b += access_size_b;
3192
3193   if (ranges_known_overlap_p (offset_a, const_length_a,
3194                               offset_b, const_length_b))
3195     return 1;
3196
3197   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3198                                offset_b, const_length_b))
3199     return 0;
3200
3201   return -1;
3202 }
3203
3204 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3205    in DDR is >= VF.  */
3206
3207 static bool
3208 dependence_distance_ge_vf (data_dependence_relation *ddr,
3209                            unsigned int loop_depth, poly_uint64 vf)
3210 {
3211   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3212       || DDR_NUM_DIST_VECTS (ddr) == 0)
3213     return false;
3214
3215   /* If the dependence is exact, we should have limited the VF instead.  */
3216   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3217
3218   unsigned int i;
3219   lambda_vector dist_v;
3220   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3221     {
3222       HOST_WIDE_INT dist = dist_v[loop_depth];
3223       if (dist != 0
3224           && !(dist > 0 && DDR_REVERSED_P (ddr))
3225           && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3226         return false;
3227     }
3228
3229   if (dump_enabled_p ())
3230     {
3231       dump_printf_loc (MSG_NOTE, vect_location,
3232                        "dependence distance between ");
3233       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3234       dump_printf (MSG_NOTE,  " and ");
3235       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3236       dump_printf (MSG_NOTE,  " is >= VF\n");
3237     }
3238
3239   return true;
3240 }
3241
3242 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3243
3244 static void
3245 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3246 {
3247   dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3248   dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3249   dump_printf (dump_kind, ") >= ");
3250   dump_dec (dump_kind, lower_bound.min_value);
3251 }
3252
3253 /* Record that the vectorized loop requires the vec_lower_bound described
3254    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3255
3256 static void
3257 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3258                         poly_uint64 min_value)
3259 {
3260   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3261   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3262     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3263       {
3264         unsigned_p &= lower_bounds[i].unsigned_p;
3265         min_value = upper_bound (lower_bounds[i].min_value, min_value);
3266         if (lower_bounds[i].unsigned_p != unsigned_p
3267             || maybe_lt (lower_bounds[i].min_value, min_value))
3268           {
3269             lower_bounds[i].unsigned_p = unsigned_p;
3270             lower_bounds[i].min_value = min_value;
3271             if (dump_enabled_p ())
3272               {
3273                 dump_printf_loc (MSG_NOTE, vect_location,
3274                                  "updating run-time check to ");
3275                 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3276                 dump_printf (MSG_NOTE, "\n");
3277               }
3278           }
3279         return;
3280       }
3281
3282   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3283   if (dump_enabled_p ())
3284     {
3285       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3286       dump_lower_bound (MSG_NOTE, lower_bound);
3287       dump_printf (MSG_NOTE, "\n");
3288     }
3289   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3290 }
3291
3292 /* Return true if it's unlikely that the step of the vectorized form of DR
3293    will span fewer than GAP bytes.  */
3294
3295 static bool
3296 vect_small_gap_p (loop_vec_info loop_vinfo, data_reference *dr, poly_int64 gap)
3297 {
3298   stmt_vec_info stmt_info = vinfo_for_stmt (vect_dr_stmt (dr));
3299   HOST_WIDE_INT count
3300     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3301   if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3302     count *= DR_GROUP_SIZE (vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (stmt_info)));
3303   return estimated_poly_value (gap) <= count * vect_get_scalar_dr_size (dr);
3304 }
3305
3306 /* Return true if we know that there is no alias between DR_A and DR_B
3307    when abs (DR_STEP (DR_A)) >= N for some N.  When returning true, set
3308    *LOWER_BOUND_OUT to this N.  */
3309
3310 static bool
3311 vectorizable_with_step_bound_p (data_reference *dr_a, data_reference *dr_b,
3312                                 poly_uint64 *lower_bound_out)
3313 {
3314   /* Check that there is a constant gap of known sign between DR_A
3315      and DR_B.  */
3316   poly_int64 init_a, init_b;
3317   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3318       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3319       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3320       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3321       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3322       || !ordered_p (init_a, init_b))
3323     return false;
3324
3325   /* Sort DR_A and DR_B by the address they access.  */
3326   if (maybe_lt (init_b, init_a))
3327     {
3328       std::swap (init_a, init_b);
3329       std::swap (dr_a, dr_b);
3330     }
3331
3332   /* If the two accesses could be dependent within a scalar iteration,
3333      make sure that we'd retain their order.  */
3334   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_a), init_b)
3335       && !vect_preserves_scalar_order_p (vect_dr_stmt (dr_a),
3336                                          vect_dr_stmt (dr_b)))
3337     return false;
3338
3339   /* There is no alias if abs (DR_STEP) is greater than or equal to
3340      the bytes spanned by the combination of the two accesses.  */
3341   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_b) - init_a;
3342   return true;
3343 }
3344
3345 /* Function vect_prune_runtime_alias_test_list.
3346
3347    Prune a list of ddrs to be tested at run-time by versioning for alias.
3348    Merge several alias checks into one if possible.
3349    Return FALSE if resulting list of ddrs is longer then allowed by
3350    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3351
3352 bool
3353 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3354 {
3355   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3356   hash_set <tree_pair_hash> compared_objects;
3357
3358   vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3359   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3360     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3361   vec<vec_object_pair> &check_unequal_addrs
3362     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3363   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3364   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3365
3366   ddr_p ddr;
3367   unsigned int i;
3368   tree length_factor;
3369
3370   DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3371
3372   /* Step values are irrelevant for aliasing if the number of vector
3373      iterations is equal to the number of scalar iterations (which can
3374      happen for fully-SLP loops).  */
3375   bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3376
3377   if (!ignore_step_p)
3378     {
3379       /* Convert the checks for nonzero steps into bound tests.  */
3380       tree value;
3381       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3382         vect_check_lower_bound (loop_vinfo, value, true, 1);
3383     }
3384
3385   if (may_alias_ddrs.is_empty ())
3386     return true;
3387
3388   comp_alias_ddrs.create (may_alias_ddrs.length ());
3389
3390   unsigned int loop_depth
3391     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3392                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
3393
3394   /* First, we collect all data ref pairs for aliasing checks.  */
3395   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3396     {
3397       int comp_res;
3398       poly_uint64 lower_bound;
3399       struct data_reference *dr_a, *dr_b;
3400       gimple *dr_group_first_a, *dr_group_first_b;
3401       tree segment_length_a, segment_length_b;
3402       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3403       unsigned int align_a, align_b;
3404       gimple *stmt_a, *stmt_b;
3405
3406       /* Ignore the alias if the VF we chose ended up being no greater
3407          than the dependence distance.  */
3408       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3409         continue;
3410
3411       if (DDR_OBJECT_A (ddr))
3412         {
3413           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3414           if (!compared_objects.add (new_pair))
3415             {
3416               if (dump_enabled_p ())
3417                 {
3418                   dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3419                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3420                   dump_printf (MSG_NOTE, " and ");
3421                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3422                   dump_printf (MSG_NOTE, " have different addresses\n");
3423                 }
3424               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3425             }
3426           continue;
3427         }
3428
3429       dr_a = DDR_A (ddr);
3430       stmt_a = vect_dr_stmt (DDR_A (ddr));
3431
3432       dr_b = DDR_B (ddr);
3433       stmt_b = vect_dr_stmt (DDR_B (ddr));
3434
3435       /* Skip the pair if inter-iteration dependencies are irrelevant
3436          and intra-iteration dependencies are guaranteed to be honored.  */
3437       if (ignore_step_p
3438           && (vect_preserves_scalar_order_p (stmt_a, stmt_b)
3439               || vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)))
3440         {
3441           if (dump_enabled_p ())
3442             {
3443               dump_printf_loc (MSG_NOTE, vect_location,
3444                                "no need for alias check between ");
3445               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3446               dump_printf (MSG_NOTE, " and ");
3447               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3448               dump_printf (MSG_NOTE, " when VF is 1\n");
3449             }
3450           continue;
3451         }
3452
3453       /* See whether we can handle the alias using a bounds check on
3454          the step, and whether that's likely to be the best approach.
3455          (It might not be, for example, if the minimum step is much larger
3456          than the number of bytes handled by one vector iteration.)  */
3457       if (!ignore_step_p
3458           && TREE_CODE (DR_STEP (dr_a)) != INTEGER_CST
3459           && vectorizable_with_step_bound_p (dr_a, dr_b, &lower_bound)
3460           && (vect_small_gap_p (loop_vinfo, dr_a, lower_bound)
3461               || vect_small_gap_p (loop_vinfo, dr_b, lower_bound)))
3462         {
3463           bool unsigned_p = dr_known_forward_stride_p (dr_a);
3464           if (dump_enabled_p ())
3465             {
3466               dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3467               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3468               dump_printf (MSG_NOTE, " and ");
3469               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3470               dump_printf (MSG_NOTE, " when the step ");
3471               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_a));
3472               dump_printf (MSG_NOTE, " is outside ");
3473               if (unsigned_p)
3474                 dump_printf (MSG_NOTE, "[0");
3475               else
3476                 {
3477                   dump_printf (MSG_NOTE, "(");
3478                   dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3479                 }
3480               dump_printf (MSG_NOTE, ", ");
3481               dump_dec (MSG_NOTE, lower_bound);
3482               dump_printf (MSG_NOTE, ")\n");
3483             }
3484           vect_check_lower_bound (loop_vinfo, DR_STEP (dr_a), unsigned_p,
3485                                   lower_bound);
3486           continue;
3487         }
3488
3489       dr_group_first_a = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3490       if (dr_group_first_a)
3491         {
3492           stmt_a = dr_group_first_a;
3493           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3494         }
3495
3496       dr_group_first_b = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3497       if (dr_group_first_b)
3498         {
3499           stmt_b = dr_group_first_b;
3500           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3501         }
3502
3503       if (ignore_step_p)
3504         {
3505           segment_length_a = size_zero_node;
3506           segment_length_b = size_zero_node;
3507         }
3508       else
3509         {
3510           if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3511             length_factor = scalar_loop_iters;
3512           else
3513             length_factor = size_int (vect_factor);
3514           segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3515           segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3516         }
3517       access_size_a = vect_vfa_access_size (dr_a);
3518       access_size_b = vect_vfa_access_size (dr_b);
3519       align_a = vect_vfa_align (dr_a);
3520       align_b = vect_vfa_align (dr_b);
3521
3522       comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3523                                         DR_BASE_ADDRESS (dr_b));
3524       if (comp_res == 0)
3525         comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3526                                           DR_OFFSET (dr_b));
3527
3528       /* See whether the alias is known at compilation time.  */
3529       if (comp_res == 0
3530           && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3531           && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3532           && poly_int_tree_p (segment_length_a)
3533           && poly_int_tree_p (segment_length_b))
3534         {
3535           int res = vect_compile_time_alias (dr_a, dr_b,
3536                                              segment_length_a,
3537                                              segment_length_b,
3538                                              access_size_a,
3539                                              access_size_b);
3540           if (res >= 0 && dump_enabled_p ())
3541             {
3542               dump_printf_loc (MSG_NOTE, vect_location,
3543                                "can tell at compile time that ");
3544               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a));
3545               dump_printf (MSG_NOTE, " and ");
3546               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b));
3547               if (res == 0)
3548                 dump_printf (MSG_NOTE, " do not alias\n");
3549               else
3550                 dump_printf (MSG_NOTE, " alias\n");
3551             }
3552
3553           if (res == 0)
3554             continue;
3555
3556           if (res == 1)
3557             {
3558               if (dump_enabled_p ())
3559                 dump_printf_loc (MSG_NOTE, vect_location,
3560                                  "not vectorized: compilation time alias.\n");
3561               return false;
3562             }
3563         }
3564
3565       dr_with_seg_len_pair_t dr_with_seg_len_pair
3566         (dr_with_seg_len (dr_a, segment_length_a, access_size_a, align_a),
3567          dr_with_seg_len (dr_b, segment_length_b, access_size_b, align_b));
3568
3569       /* Canonicalize pairs by sorting the two DR members.  */
3570       if (comp_res > 0)
3571         std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3572
3573       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3574     }
3575
3576   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3577
3578   unsigned int count = (comp_alias_ddrs.length ()
3579                         + check_unequal_addrs.length ());
3580
3581   dump_printf_loc (MSG_NOTE, vect_location,
3582                    "improved number of alias checks from %d to %d\n",
3583                    may_alias_ddrs.length (), count);
3584   if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3585     {
3586       if (dump_enabled_p ())
3587         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3588                          "number of versioning for alias "
3589                          "run-time tests exceeds %d "
3590                          "(--param vect-max-version-for-alias-checks)\n",
3591                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3592       return false;
3593     }
3594
3595   return true;
3596 }
3597
3598 /* Check whether we can use an internal function for a gather load
3599    or scatter store.  READ_P is true for loads and false for stores.
3600    MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
3601    the type of the memory elements being loaded or stored.  OFFSET_BITS
3602    is the number of bits in each scalar offset and OFFSET_SIGN is the
3603    sign of the offset.  SCALE is the amount by which the offset should
3604    be multiplied *after* it has been converted to address width.
3605
3606    Return true if the function is supported, storing the function
3607    id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT.  */
3608
3609 bool
3610 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3611                           tree memory_type, unsigned int offset_bits,
3612                           signop offset_sign, int scale,
3613                           internal_fn *ifn_out, tree *element_type_out)
3614 {
3615   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3616   unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3617   if (offset_bits > element_bits)
3618     /* Internal functions require the offset to be the same width as
3619        the vector elements.  We can extend narrower offsets, but it isn't
3620        safe to truncate wider offsets.  */
3621     return false;
3622
3623   if (element_bits != memory_bits)
3624     /* For now the vector elements must be the same width as the
3625        memory elements.  */
3626     return false;
3627
3628   /* Work out which function we need.  */
3629   internal_fn ifn;
3630   if (read_p)
3631     ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3632   else
3633     ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3634
3635   /* Test whether the target supports this combination.  */
3636   if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3637                                                offset_sign, scale))
3638     return false;
3639
3640   *ifn_out = ifn;
3641   *element_type_out = TREE_TYPE (vectype);
3642   return true;
3643 }
3644
3645 /* CALL is a call to an internal gather load or scatter store function.
3646    Describe the operation in INFO.  */
3647
3648 static void
3649 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3650 {
3651   stmt_vec_info stmt_info = vinfo_for_stmt (call);
3652   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3653   data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3654
3655   info->ifn = gimple_call_internal_fn (call);
3656   info->decl = NULL_TREE;
3657   info->base = gimple_call_arg (call, 0);
3658   info->offset = gimple_call_arg (call, 1);
3659   info->offset_dt = vect_unknown_def_type;
3660   info->offset_vectype = NULL_TREE;
3661   info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3662   info->element_type = TREE_TYPE (vectype);
3663   info->memory_type = TREE_TYPE (DR_REF (dr));
3664 }
3665
3666 /* Return true if a non-affine read or write in STMT is suitable for a
3667    gather load or scatter store.  Describe the operation in *INFO if so.  */
3668
3669 bool
3670 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3671                            gather_scatter_info *info)
3672 {
3673   HOST_WIDE_INT scale = 1;
3674   poly_int64 pbitpos, pbitsize;
3675   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3676   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3677   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3678   tree offtype = NULL_TREE;
3679   tree decl = NULL_TREE, base, off;
3680   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3681   tree memory_type = TREE_TYPE (DR_REF (dr));
3682   machine_mode pmode;
3683   int punsignedp, reversep, pvolatilep = 0;
3684   internal_fn ifn;
3685   tree element_type;
3686   bool masked_p = false;
3687
3688   /* See whether this is already a call to a gather/scatter internal function.
3689      If not, see whether it's a masked load or store.  */
3690   gcall *call = dyn_cast <gcall *> (stmt);
3691   if (call && gimple_call_internal_p (call))
3692     {
3693       ifn = gimple_call_internal_fn (stmt);
3694       if (internal_gather_scatter_fn_p (ifn))
3695         {
3696           vect_describe_gather_scatter_call (call, info);
3697           return true;
3698         }
3699       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3700     }
3701
3702   /* True if we should aim to use internal functions rather than
3703      built-in functions.  */
3704   bool use_ifn_p = (DR_IS_READ (dr)
3705                     ? supports_vec_gather_load_p ()
3706                     : supports_vec_scatter_store_p ());
3707
3708   base = DR_REF (dr);
3709   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3710      see if we can use the def stmt of the address.  */
3711   if (masked_p
3712       && TREE_CODE (base) == MEM_REF
3713       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3714       && integer_zerop (TREE_OPERAND (base, 1))
3715       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3716     {
3717       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3718       if (is_gimple_assign (def_stmt)
3719           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3720         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3721     }
3722
3723   /* The gather and scatter builtins need address of the form
3724      loop_invariant + vector * {1, 2, 4, 8}
3725      or
3726      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3727      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3728      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3729      multiplications and additions in it.  To get a vector, we need
3730      a single SSA_NAME that will be defined in the loop and will
3731      contain everything that is not loop invariant and that can be
3732      vectorized.  The following code attempts to find such a preexistng
3733      SSA_NAME OFF and put the loop invariants into a tree BASE
3734      that can be gimplified before the loop.  */
3735   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3736                               &punsignedp, &reversep, &pvolatilep);
3737   gcc_assert (base && !reversep);
3738   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3739
3740   if (TREE_CODE (base) == MEM_REF)
3741     {
3742       if (!integer_zerop (TREE_OPERAND (base, 1)))
3743         {
3744           if (off == NULL_TREE)
3745             off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3746           else
3747             off = size_binop (PLUS_EXPR, off,
3748                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3749         }
3750       base = TREE_OPERAND (base, 0);
3751     }
3752   else
3753     base = build_fold_addr_expr (base);
3754
3755   if (off == NULL_TREE)
3756     off = size_zero_node;
3757
3758   /* If base is not loop invariant, either off is 0, then we start with just
3759      the constant offset in the loop invariant BASE and continue with base
3760      as OFF, otherwise give up.
3761      We could handle that case by gimplifying the addition of base + off
3762      into some SSA_NAME and use that as off, but for now punt.  */
3763   if (!expr_invariant_in_loop_p (loop, base))
3764     {
3765       if (!integer_zerop (off))
3766         return false;
3767       off = base;
3768       base = size_int (pbytepos);
3769     }
3770   /* Otherwise put base + constant offset into the loop invariant BASE
3771      and continue with OFF.  */
3772   else
3773     {
3774       base = fold_convert (sizetype, base);
3775       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3776     }
3777
3778   /* OFF at this point may be either a SSA_NAME or some tree expression
3779      from get_inner_reference.  Try to peel off loop invariants from it
3780      into BASE as long as possible.  */
3781   STRIP_NOPS (off);
3782   while (offtype == NULL_TREE)
3783     {
3784       enum tree_code code;
3785       tree op0, op1, add = NULL_TREE;
3786
3787       if (TREE_CODE (off) == SSA_NAME)
3788         {
3789           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3790
3791           if (expr_invariant_in_loop_p (loop, off))
3792             return false;
3793
3794           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3795             break;
3796
3797           op0 = gimple_assign_rhs1 (def_stmt);
3798           code = gimple_assign_rhs_code (def_stmt);
3799           op1 = gimple_assign_rhs2 (def_stmt);
3800         }
3801       else
3802         {
3803           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3804             return false;
3805           code = TREE_CODE (off);
3806           extract_ops_from_tree (off, &code, &op0, &op1);
3807         }
3808       switch (code)
3809         {
3810         case POINTER_PLUS_EXPR:
3811         case PLUS_EXPR:
3812           if (expr_invariant_in_loop_p (loop, op0))
3813             {
3814               add = op0;
3815               off = op1;
3816             do_add:
3817               add = fold_convert (sizetype, add);
3818               if (scale != 1)
3819                 add = size_binop (MULT_EXPR, add, size_int (scale));
3820               base = size_binop (PLUS_EXPR, base, add);
3821               continue;
3822             }
3823           if (expr_invariant_in_loop_p (loop, op1))
3824             {
3825               add = op1;
3826               off = op0;
3827               goto do_add;
3828             }
3829           break;
3830         case MINUS_EXPR:
3831           if (expr_invariant_in_loop_p (loop, op1))
3832             {
3833               add = fold_convert (sizetype, op1);
3834               add = size_binop (MINUS_EXPR, size_zero_node, add);
3835               off = op0;
3836               goto do_add;
3837             }
3838           break;
3839         case MULT_EXPR:
3840           if (scale == 1 && tree_fits_shwi_p (op1))
3841             {
3842               int new_scale = tree_to_shwi (op1);
3843               /* Only treat this as a scaling operation if the target
3844                  supports it.  */
3845               if (use_ifn_p
3846                   && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3847                                                 vectype, memory_type, 1,
3848                                                 TYPE_SIGN (TREE_TYPE (op0)),
3849                                                 new_scale, &ifn,
3850                                                 &element_type))
3851                 break;
3852               scale = new_scale;
3853               off = op0;
3854               continue;
3855             }
3856           break;
3857         case SSA_NAME:
3858           off = op0;
3859           continue;
3860         CASE_CONVERT:
3861           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3862               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3863             break;
3864           if (TYPE_PRECISION (TREE_TYPE (op0))
3865               == TYPE_PRECISION (TREE_TYPE (off)))
3866             {
3867               off = op0;
3868               continue;
3869             }
3870
3871           /* The internal functions need the offset to be the same width
3872              as the elements of VECTYPE.  Don't include operations that
3873              cast the offset from that width to a different width.  */
3874           if (use_ifn_p
3875               && (int_size_in_bytes (TREE_TYPE (vectype))
3876                   == int_size_in_bytes (TREE_TYPE (off))))
3877             break;
3878
3879           if (TYPE_PRECISION (TREE_TYPE (op0))
3880               < TYPE_PRECISION (TREE_TYPE (off)))
3881             {
3882               off = op0;
3883               offtype = TREE_TYPE (off);
3884               STRIP_NOPS (off);
3885               continue;
3886             }
3887           break;
3888         default:
3889           break;
3890         }
3891       break;
3892     }
3893
3894   /* If at the end OFF still isn't a SSA_NAME or isn't
3895      defined in the loop, punt.  */
3896   if (TREE_CODE (off) != SSA_NAME
3897       || expr_invariant_in_loop_p (loop, off))
3898     return false;
3899
3900   if (offtype == NULL_TREE)
3901     offtype = TREE_TYPE (off);
3902
3903   if (use_ifn_p)
3904     {
3905       if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3906                                      memory_type, TYPE_PRECISION (offtype),
3907                                      TYPE_SIGN (offtype), scale, &ifn,
3908                                      &element_type))
3909         return false;
3910     }
3911   else
3912     {
3913       if (DR_IS_READ (dr))
3914         {
3915           if (targetm.vectorize.builtin_gather)
3916             decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3917         }
3918       else
3919         {
3920           if (targetm.vectorize.builtin_scatter)
3921             decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3922         }
3923
3924       if (!decl)
3925         return false;
3926
3927       ifn = IFN_LAST;
3928       element_type = TREE_TYPE (vectype);
3929     }
3930
3931   info->ifn = ifn;
3932   info->decl = decl;
3933   info->base = base;
3934   info->offset = off;
3935   info->offset_dt = vect_unknown_def_type;
3936   info->offset_vectype = NULL_TREE;
3937   info->scale = scale;
3938   info->element_type = element_type;
3939   info->memory_type = memory_type;
3940   return true;
3941 }
3942
3943 /* Find the data references in STMT, analyze them with respect to LOOP and
3944    append them to DATAREFS.  Return false if datarefs in this stmt cannot
3945    be handled.  */
3946
3947 bool
3948 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3949                                vec<data_reference_p> *datarefs)
3950 {
3951   /* We can ignore clobbers for dataref analysis - they are removed during
3952      loop vectorization and BB vectorization checks dependences with a
3953      stmt walk.  */
3954   if (gimple_clobber_p (stmt))
3955     return true;
3956
3957   if (gimple_has_volatile_ops (stmt))
3958     {
3959       if (dump_enabled_p ())
3960         {
3961           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3962                            "not vectorized: volatile type ");
3963           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3964         }
3965       return false;
3966     }
3967
3968   if (stmt_can_throw_internal (stmt))
3969     {
3970       if (dump_enabled_p ())
3971         {
3972           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3973                            "not vectorized: statement can throw an "
3974                            "exception ");
3975           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3976         }
3977       return false;
3978     }
3979
3980   auto_vec<data_reference_p, 2> refs;
3981   if (!find_data_references_in_stmt (loop, stmt, &refs))
3982     return false;
3983
3984   if (refs.is_empty ())
3985     return true;
3986
3987   if (refs.length () > 1)
3988     {
3989       if (dump_enabled_p ())
3990         {
3991           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3992                            "not vectorized: more than one data ref "
3993                            "in stmt: ");
3994           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3995         }
3996       return false;
3997     }
3998
3999   if (gcall *call = dyn_cast <gcall *> (stmt))
4000     if (!gimple_call_internal_p (call)
4001         || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4002             && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4003       {
4004         if (dump_enabled_p ())
4005           {
4006             dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
4007                              "not vectorized: dr in a call ");
4008             dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4009           }
4010         return false;
4011       }
4012
4013   data_reference_p dr = refs.pop ();
4014   if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4015       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4016     {
4017       if (dump_enabled_p ())
4018         {
4019           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4020                            "not vectorized: statement is bitfield "
4021                            "access ");
4022           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4023         }
4024       return false;
4025     }
4026
4027   if (DR_BASE_ADDRESS (dr)
4028       && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4029     {
4030       if (dump_enabled_p ())
4031         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4032                          "not vectorized: base addr of dr is a "
4033                          "constant\n");
4034       return false;
4035     }
4036
4037   datarefs->safe_push (dr);
4038   return true;
4039 }
4040
4041 /* Function vect_analyze_data_refs.
4042
4043   Find all the data references in the loop or basic block.
4044
4045    The general structure of the analysis of data refs in the vectorizer is as
4046    follows:
4047    1- vect_analyze_data_refs(loop/bb): call
4048       compute_data_dependences_for_loop/bb to find and analyze all data-refs
4049       in the loop/bb and their dependences.
4050    2- vect_analyze_dependences(): apply dependence testing using ddrs.
4051    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4052    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4053
4054 */
4055
4056 bool
4057 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4058 {
4059   struct loop *loop = NULL;
4060   unsigned int i;
4061   struct data_reference *dr;
4062   tree scalar_type;
4063
4064   DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4065
4066   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4067     loop = LOOP_VINFO_LOOP (loop_vinfo);
4068
4069   /* Go through the data-refs, check that the analysis succeeded.  Update
4070      pointer from stmt_vec_info struct to DR and vectype.  */
4071
4072   vec<data_reference_p> datarefs = vinfo->datarefs;
4073   FOR_EACH_VEC_ELT (datarefs, i, dr)
4074     {
4075       gimple *stmt;
4076       stmt_vec_info stmt_info;
4077       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4078       bool simd_lane_access = false;
4079       poly_uint64 vf;
4080
4081       gcc_assert (DR_REF (dr));
4082       stmt = vect_dr_stmt (dr);
4083       stmt_info = vinfo_for_stmt (stmt);
4084
4085       /* Check that analysis of the data-ref succeeded.  */
4086       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4087           || !DR_STEP (dr))
4088         {
4089           bool maybe_gather
4090             = DR_IS_READ (dr)
4091               && !TREE_THIS_VOLATILE (DR_REF (dr))
4092               && (targetm.vectorize.builtin_gather != NULL
4093                   || supports_vec_gather_load_p ());
4094           bool maybe_scatter
4095             = DR_IS_WRITE (dr)
4096               && !TREE_THIS_VOLATILE (DR_REF (dr))
4097               && (targetm.vectorize.builtin_scatter != NULL
4098                   || supports_vec_scatter_store_p ());
4099           bool maybe_simd_lane_access
4100             = is_a <loop_vec_info> (vinfo) && loop->simduid;
4101
4102           /* If target supports vector gather loads or scatter stores, or if
4103              this might be a SIMD lane access, see if they can't be used.  */
4104           if (is_a <loop_vec_info> (vinfo)
4105               && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
4106               && !nested_in_vect_loop_p (loop, stmt))
4107             {
4108               struct data_reference *newdr
4109                 = create_data_ref (NULL, loop_containing_stmt (stmt),
4110                                    DR_REF (dr), stmt, !maybe_scatter,
4111                                    DR_IS_CONDITIONAL_IN_STMT (dr));
4112               gcc_assert (newdr != NULL && DR_REF (newdr));
4113               if (DR_BASE_ADDRESS (newdr)
4114                   && DR_OFFSET (newdr)
4115                   && DR_INIT (newdr)
4116                   && DR_STEP (newdr)
4117                   && integer_zerop (DR_STEP (newdr)))
4118                 {
4119                   if (maybe_simd_lane_access)
4120                     {
4121                       tree off = DR_OFFSET (newdr);
4122                       STRIP_NOPS (off);
4123                       if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4124                           && TREE_CODE (off) == MULT_EXPR
4125                           && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4126                         {
4127                           tree step = TREE_OPERAND (off, 1);
4128                           off = TREE_OPERAND (off, 0);
4129                           STRIP_NOPS (off);
4130                           if (CONVERT_EXPR_P (off)
4131                               && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
4132                                                                           0)))
4133                                  < TYPE_PRECISION (TREE_TYPE (off)))
4134                             off = TREE_OPERAND (off, 0);
4135                           if (TREE_CODE (off) == SSA_NAME)
4136                             {
4137                               gimple *def = SSA_NAME_DEF_STMT (off);
4138                               tree reft = TREE_TYPE (DR_REF (newdr));
4139                               if (is_gimple_call (def)
4140                                   && gimple_call_internal_p (def)
4141                                   && (gimple_call_internal_fn (def)
4142                                       == IFN_GOMP_SIMD_LANE))
4143                                 {
4144                                   tree arg = gimple_call_arg (def, 0);
4145                                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
4146                                   arg = SSA_NAME_VAR (arg);
4147                                   if (arg == loop->simduid
4148                                       /* For now.  */
4149                                       && tree_int_cst_equal
4150                                            (TYPE_SIZE_UNIT (reft),
4151                                             step))
4152                                     {
4153                                       DR_OFFSET (newdr) = ssize_int (0);
4154                                       DR_STEP (newdr) = step;
4155                                       DR_OFFSET_ALIGNMENT (newdr)
4156                                         = BIGGEST_ALIGNMENT;
4157                                       DR_STEP_ALIGNMENT (newdr)
4158                                         = highest_pow2_factor (step);
4159                                       dr = newdr;
4160                                       simd_lane_access = true;
4161                                     }
4162                                 }
4163                             }
4164                         }
4165                     }
4166                   if (!simd_lane_access && (maybe_gather || maybe_scatter))
4167                     {
4168                       dr = newdr;
4169                       if (maybe_gather)
4170                         gatherscatter = GATHER;
4171                       else
4172                         gatherscatter = SCATTER;
4173                     }
4174                 }
4175               if (gatherscatter == SG_NONE && !simd_lane_access)
4176                 free_data_ref (newdr);
4177             }
4178
4179           if (gatherscatter == SG_NONE && !simd_lane_access)
4180             {
4181               if (dump_enabled_p ())
4182                 {
4183                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4184                                    "not vectorized: data ref analysis "
4185                                    "failed ");
4186                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4187                 }
4188               if (is_a <bb_vec_info> (vinfo))
4189                 {
4190                   /* In BB vectorization the ref can still participate
4191                      in dependence analysis, we just can't vectorize it.  */
4192                   STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4193                   continue;
4194                 }
4195               return false;
4196             }
4197         }
4198
4199       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
4200           && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4201           && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4202         {
4203           if (dump_enabled_p ())
4204             {
4205               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4206                                "not vectorized: base object not addressable "
4207                                "for stmt: ");
4208               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4209             }
4210           if (is_a <bb_vec_info> (vinfo))
4211             {
4212               /* In BB vectorization the ref can still participate
4213                  in dependence analysis, we just can't vectorize it.  */
4214               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4215               continue;
4216             }
4217           return false;
4218         }
4219
4220       if (is_a <loop_vec_info> (vinfo)
4221           && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4222         {
4223           if (nested_in_vect_loop_p (loop, stmt))
4224             {
4225               if (dump_enabled_p ())
4226                 {
4227                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4228                                    "not vectorized: not suitable for strided "
4229                                    "load ");
4230                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4231                 }
4232               return false;
4233             }
4234           STMT_VINFO_STRIDED_P (stmt_info) = true;
4235         }
4236
4237       /* Update DR field in stmt_vec_info struct.  */
4238
4239       /* If the dataref is in an inner-loop of the loop that is considered for
4240          for vectorization, we also want to analyze the access relative to
4241          the outer-loop (DR contains information only relative to the
4242          inner-most enclosing loop).  We do that by building a reference to the
4243          first location accessed by the inner-loop, and analyze it relative to
4244          the outer-loop.  */
4245       if (loop && nested_in_vect_loop_p (loop, stmt))
4246         {
4247           /* Build a reference to the first location accessed by the
4248              inner loop: *(BASE + INIT + OFFSET).  By construction,
4249              this address must be invariant in the inner loop, so we
4250              can consider it as being used in the outer loop.  */
4251           tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4252           tree offset = unshare_expr (DR_OFFSET (dr));
4253           tree init = unshare_expr (DR_INIT (dr));
4254           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4255                                           init, offset);
4256           tree init_addr = fold_build_pointer_plus (base, init_offset);
4257           tree init_ref = build_fold_indirect_ref (init_addr);
4258
4259           if (dump_enabled_p ())
4260             {
4261               dump_printf_loc (MSG_NOTE, vect_location,
4262                                "analyze in outer loop: ");
4263               dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4264               dump_printf (MSG_NOTE, "\n");
4265             }
4266
4267           if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4268                                      init_ref, loop))
4269             /* dr_analyze_innermost already explained the failure.  */
4270             return false;
4271
4272           if (dump_enabled_p ())
4273             {
4274               dump_printf_loc (MSG_NOTE, vect_location,
4275                                "\touter base_address: ");
4276               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4277                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4278               dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4279               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4280                                  STMT_VINFO_DR_OFFSET (stmt_info));
4281               dump_printf (MSG_NOTE,
4282                            "\n\touter constant offset from base address: ");
4283               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4284                                  STMT_VINFO_DR_INIT (stmt_info));
4285               dump_printf (MSG_NOTE, "\n\touter step: ");
4286               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4287                                  STMT_VINFO_DR_STEP (stmt_info));
4288               dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4289                            STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4290               dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4291                            STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4292               dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4293                            STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4294               dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4295                            STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4296             }
4297         }
4298
4299       gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
4300       STMT_VINFO_DATA_REF (stmt_info) = dr;
4301       if (simd_lane_access)
4302         {
4303           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4304           free_data_ref (datarefs[i]);
4305           datarefs[i] = dr;
4306         }
4307
4308       /* Set vectype for STMT.  */
4309       scalar_type = TREE_TYPE (DR_REF (dr));
4310       STMT_VINFO_VECTYPE (stmt_info)
4311         = get_vectype_for_scalar_type (scalar_type);
4312       if (!STMT_VINFO_VECTYPE (stmt_info))
4313         {
4314           if (dump_enabled_p ())
4315             {
4316               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4317                                "not vectorized: no vectype for stmt: ");
4318               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4319               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4320               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4321                                  scalar_type);
4322               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4323             }
4324
4325           if (is_a <bb_vec_info> (vinfo))
4326             {
4327               /* No vector type is fine, the ref can still participate
4328                  in dependence analysis, we just can't vectorize it.  */
4329               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4330               continue;
4331             }
4332
4333           if (gatherscatter != SG_NONE || simd_lane_access)
4334             {
4335               STMT_VINFO_DATA_REF (stmt_info) = NULL;
4336               if (gatherscatter != SG_NONE)
4337                 free_data_ref (dr);
4338             }
4339           return false;
4340         }
4341       else
4342         {
4343           if (dump_enabled_p ())
4344             {
4345               dump_printf_loc (MSG_NOTE, vect_location,
4346                                "got vectype for stmt: ");
4347               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4348               dump_generic_expr (MSG_NOTE, TDF_SLIM,
4349                                  STMT_VINFO_VECTYPE (stmt_info));
4350               dump_printf (MSG_NOTE, "\n");
4351             }
4352         }
4353
4354       /* Adjust the minimal vectorization factor according to the
4355          vector type.  */
4356       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4357       *min_vf = upper_bound (*min_vf, vf);
4358
4359       if (gatherscatter != SG_NONE)
4360         {
4361           gather_scatter_info gs_info;
4362           if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4363                                           &gs_info)
4364               || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4365             {
4366               STMT_VINFO_DATA_REF (stmt_info) = NULL;
4367               free_data_ref (dr);
4368               if (dump_enabled_p ())
4369                 {
4370                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4371                                    (gatherscatter == GATHER) ?
4372                                    "not vectorized: not suitable for gather "
4373                                    "load " :
4374                                    "not vectorized: not suitable for scatter "
4375                                    "store ");
4376                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4377                 }
4378               return false;
4379             }
4380
4381           free_data_ref (datarefs[i]);
4382           datarefs[i] = dr;
4383           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4384         }
4385     }
4386
4387   /* We used to stop processing and prune the list here.  Verify we no
4388      longer need to.  */
4389   gcc_assert (i == datarefs.length ());
4390
4391   return true;
4392 }
4393
4394
4395 /* Function vect_get_new_vect_var.
4396
4397    Returns a name for a new variable.  The current naming scheme appends the
4398    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4399    the name of vectorizer generated variables, and appends that to NAME if
4400    provided.  */
4401
4402 tree
4403 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4404 {
4405   const char *prefix;
4406   tree new_vect_var;
4407
4408   switch (var_kind)
4409   {
4410   case vect_simple_var:
4411     prefix = "vect";
4412     break;
4413   case vect_scalar_var:
4414     prefix = "stmp";
4415     break;
4416   case vect_mask_var:
4417     prefix = "mask";
4418     break;
4419   case vect_pointer_var:
4420     prefix = "vectp";
4421     break;
4422   default:
4423     gcc_unreachable ();
4424   }
4425
4426   if (name)
4427     {
4428       char* tmp = concat (prefix, "_", name, NULL);
4429       new_vect_var = create_tmp_reg (type, tmp);
4430       free (tmp);
4431     }
4432   else
4433     new_vect_var = create_tmp_reg (type, prefix);
4434
4435   return new_vect_var;
4436 }
4437
4438 /* Like vect_get_new_vect_var but return an SSA name.  */
4439
4440 tree
4441 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4442 {
4443   const char *prefix;
4444   tree new_vect_var;
4445
4446   switch (var_kind)
4447   {
4448   case vect_simple_var:
4449     prefix = "vect";
4450     break;
4451   case vect_scalar_var:
4452     prefix = "stmp";
4453     break;
4454   case vect_pointer_var:
4455     prefix = "vectp";
4456     break;
4457   default:
4458     gcc_unreachable ();
4459   }
4460
4461   if (name)
4462     {
4463       char* tmp = concat (prefix, "_", name, NULL);
4464       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4465       free (tmp);
4466     }
4467   else
4468     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4469
4470   return new_vect_var;
4471 }
4472
4473 /* Duplicate ptr info and set alignment/misaligment on NAME from DR.  */
4474
4475 static void
4476 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4477 {
4478   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4479   int misalign = DR_MISALIGNMENT (dr);
4480   if (misalign == DR_MISALIGNMENT_UNKNOWN)
4481     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4482   else
4483     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4484                             DR_TARGET_ALIGNMENT (dr), misalign);
4485 }
4486
4487 /* Function vect_create_addr_base_for_vector_ref.
4488
4489    Create an expression that computes the address of the first memory location
4490    that will be accessed for a data reference.
4491
4492    Input:
4493    STMT: The statement containing the data reference.
4494    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4495    OFFSET: Optional. If supplied, it is be added to the initial address.
4496    LOOP:    Specify relative to which loop-nest should the address be computed.
4497             For example, when the dataref is in an inner-loop nested in an
4498             outer-loop that is now being vectorized, LOOP can be either the
4499             outer-loop, or the inner-loop.  The first memory location accessed
4500             by the following dataref ('in' points to short):
4501
4502                 for (i=0; i<N; i++)
4503                    for (j=0; j<M; j++)
4504                      s += in[i+j]
4505
4506             is as follows:
4507             if LOOP=i_loop:     &in             (relative to i_loop)
4508             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
4509    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4510             initial address.  Unlike OFFSET, which is number of elements to
4511             be added, BYTE_OFFSET is measured in bytes.
4512
4513    Output:
4514    1. Return an SSA_NAME whose value is the address of the memory location of
4515       the first vector of the data reference.
4516    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4517       these statement(s) which define the returned SSA_NAME.
4518
4519    FORNOW: We are only handling array accesses with step 1.  */
4520
4521 tree
4522 vect_create_addr_base_for_vector_ref (gimple *stmt,
4523                                       gimple_seq *new_stmt_list,
4524                                       tree offset,
4525                                       tree byte_offset)
4526 {
4527   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4528   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4529   const char *base_name;
4530   tree addr_base;
4531   tree dest;
4532   gimple_seq seq = NULL;
4533   tree vect_ptr_type;
4534   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4535   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4536   innermost_loop_behavior *drb = vect_dr_behavior (dr);
4537
4538   tree data_ref_base = unshare_expr (drb->base_address);
4539   tree base_offset = unshare_expr (drb->offset);
4540   tree init = unshare_expr (drb->init);
4541
4542   if (loop_vinfo)
4543     base_name = get_name (data_ref_base);
4544   else
4545     {
4546       base_offset = ssize_int (0);
4547       init = ssize_int (0);
4548       base_name = get_name (DR_REF (dr));
4549     }
4550
4551   /* Create base_offset */
4552   base_offset = size_binop (PLUS_EXPR,
4553                             fold_convert (sizetype, base_offset),
4554                             fold_convert (sizetype, init));
4555
4556   if (offset)
4557     {
4558       offset = fold_build2 (MULT_EXPR, sizetype,
4559                             fold_convert (sizetype, offset), step);
4560       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4561                                  base_offset, offset);
4562     }
4563   if (byte_offset)
4564     {
4565       byte_offset = fold_convert (sizetype, byte_offset);
4566       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4567                                  base_offset, byte_offset);
4568     }
4569
4570   /* base + base_offset */
4571   if (loop_vinfo)
4572     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4573   else
4574     {
4575       addr_base = build1 (ADDR_EXPR,
4576                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
4577                           unshare_expr (DR_REF (dr)));
4578     }
4579
4580   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4581   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4582   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4583   gimple_seq_add_seq (new_stmt_list, seq);
4584
4585   if (DR_PTR_INFO (dr)
4586       && TREE_CODE (addr_base) == SSA_NAME
4587       && !SSA_NAME_PTR_INFO (addr_base))
4588     {
4589       vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4590       if (offset || byte_offset)
4591         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4592     }
4593
4594   if (dump_enabled_p ())
4595     {
4596       dump_printf_loc (MSG_NOTE, vect_location, "created ");
4597       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4598       dump_printf (MSG_NOTE, "\n");
4599     }
4600
4601   return addr_base;
4602 }
4603
4604
4605 /* Function vect_create_data_ref_ptr.
4606
4607    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4608    location accessed in the loop by STMT, along with the def-use update
4609    chain to appropriately advance the pointer through the loop iterations.
4610    Also set aliasing information for the pointer.  This pointer is used by
4611    the callers to this function to create a memory reference expression for
4612    vector load/store access.
4613
4614    Input:
4615    1. STMT: a stmt that references memory. Expected to be of the form
4616          GIMPLE_ASSIGN <name, data-ref> or
4617          GIMPLE_ASSIGN <data-ref, name>.
4618    2. AGGR_TYPE: the type of the reference, which should be either a vector
4619         or an array.
4620    3. AT_LOOP: the loop where the vector memref is to be created.
4621    4. OFFSET (optional): an offset to be added to the initial address accessed
4622         by the data-ref in STMT.
4623    5. BSI: location where the new stmts are to be placed if there is no loop
4624    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4625         pointing to the initial address.
4626    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4627         to the initial address accessed by the data-ref in STMT.  This is
4628         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4629         in bytes.
4630    8. IV_STEP (optional, defaults to NULL): the amount that should be added
4631         to the IV during each iteration of the loop.  NULL says to move
4632         by one copy of AGGR_TYPE up or down, depending on the step of the
4633         data reference.
4634
4635    Output:
4636    1. Declare a new ptr to vector_type, and have it point to the base of the
4637       data reference (initial addressed accessed by the data reference).
4638       For example, for vector of type V8HI, the following code is generated:
4639
4640       v8hi *ap;
4641       ap = (v8hi *)initial_address;
4642
4643       if OFFSET is not supplied:
4644          initial_address = &a[init];
4645       if OFFSET is supplied:
4646          initial_address = &a[init + OFFSET];
4647       if BYTE_OFFSET is supplied:
4648          initial_address = &a[init] + BYTE_OFFSET;
4649
4650       Return the initial_address in INITIAL_ADDRESS.
4651
4652    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4653       update the pointer in each iteration of the loop.
4654
4655       Return the increment stmt that updates the pointer in PTR_INCR.
4656
4657    3. Set INV_P to true if the access pattern of the data reference in the
4658       vectorized loop is invariant.  Set it to false otherwise.
4659
4660    4. Return the pointer.  */
4661
4662 tree
4663 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4664                           tree offset, tree *initial_address,
4665                           gimple_stmt_iterator *gsi, gimple **ptr_incr,
4666                           bool only_init, bool *inv_p, tree byte_offset,
4667                           tree iv_step)
4668 {
4669   const char *base_name;
4670   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4671   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4672   struct loop *loop = NULL;
4673   bool nested_in_vect_loop = false;
4674   struct loop *containing_loop = NULL;
4675   tree aggr_ptr_type;
4676   tree aggr_ptr;
4677   tree new_temp;
4678   gimple_seq new_stmt_list = NULL;
4679   edge pe = NULL;
4680   basic_block new_bb;
4681   tree aggr_ptr_init;
4682   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4683   tree aptr;
4684   gimple_stmt_iterator incr_gsi;
4685   bool insert_after;
4686   tree indx_before_incr, indx_after_incr;
4687   gimple *incr;
4688   tree step;
4689   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4690
4691   gcc_assert (iv_step != NULL_TREE
4692               || TREE_CODE (aggr_type) == ARRAY_TYPE
4693               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4694
4695   if (loop_vinfo)
4696     {
4697       loop = LOOP_VINFO_LOOP (loop_vinfo);
4698       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4699       containing_loop = (gimple_bb (stmt))->loop_father;
4700       pe = loop_preheader_edge (loop);
4701     }
4702   else
4703     {
4704       gcc_assert (bb_vinfo);
4705       only_init = true;
4706       *ptr_incr = NULL;
4707     }
4708
4709   /* Check the step (evolution) of the load in LOOP, and record
4710      whether it's invariant.  */
4711   step = vect_dr_behavior (dr)->step;
4712   if (integer_zerop (step))
4713     *inv_p = true;
4714   else
4715     *inv_p = false;
4716
4717   /* Create an expression for the first address accessed by this load
4718      in LOOP.  */
4719   base_name = get_name (DR_BASE_ADDRESS (dr));
4720
4721   if (dump_enabled_p ())
4722     {
4723       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4724       dump_printf_loc (MSG_NOTE, vect_location,
4725                        "create %s-pointer variable to type: ",
4726                        get_tree_code_name (TREE_CODE (aggr_type)));
4727       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4728       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4729         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4730       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4731         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4732       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4733         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4734       else
4735         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4736       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4737       dump_printf (MSG_NOTE, "\n");
4738     }
4739
4740   /* (1) Create the new aggregate-pointer variable.
4741      Vector and array types inherit the alias set of their component
4742      type by default so we need to use a ref-all pointer if the data
4743      reference does not conflict with the created aggregated data
4744      reference because it is not addressable.  */
4745   bool need_ref_all = false;
4746   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4747                               get_alias_set (DR_REF (dr))))
4748     need_ref_all = true;
4749   /* Likewise for any of the data references in the stmt group.  */
4750   else if (DR_GROUP_SIZE (stmt_info) > 1)
4751     {
4752       gimple *orig_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info);
4753       do
4754         {
4755           stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4756           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4757           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4758                                       get_alias_set (DR_REF (sdr))))
4759             {
4760               need_ref_all = true;
4761               break;
4762             }
4763           orig_stmt = DR_GROUP_NEXT_ELEMENT (sinfo);
4764         }
4765       while (orig_stmt);
4766     }
4767   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4768                                                need_ref_all);
4769   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4770
4771
4772   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4773      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4774      def-use update cycles for the pointer: one relative to the outer-loop
4775      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4776      to the inner-loop (which is the inner-most loop containing the dataref),
4777      and this is done be step (5) below.
4778
4779      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4780      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4781      redundant.  Steps (3),(4) create the following:
4782
4783         vp0 = &base_addr;
4784         LOOP:   vp1 = phi(vp0,vp2)
4785                 ...
4786                 ...
4787                 vp2 = vp1 + step
4788                 goto LOOP
4789
4790      If there is an inner-loop nested in loop, then step (5) will also be
4791      applied, and an additional update in the inner-loop will be created:
4792
4793         vp0 = &base_addr;
4794         LOOP:   vp1 = phi(vp0,vp2)
4795                 ...
4796         inner:     vp3 = phi(vp1,vp4)
4797                    vp4 = vp3 + inner_step
4798                    if () goto inner
4799                 ...
4800                 vp2 = vp1 + step
4801                 if () goto LOOP   */
4802
4803   /* (2) Calculate the initial address of the aggregate-pointer, and set
4804      the aggregate-pointer to point to it before the loop.  */
4805
4806   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4807
4808   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4809                                                    offset, byte_offset);
4810   if (new_stmt_list)
4811     {
4812       if (pe)
4813         {
4814           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4815           gcc_assert (!new_bb);
4816         }
4817       else
4818         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4819     }
4820
4821   *initial_address = new_temp;
4822   aggr_ptr_init = new_temp;
4823
4824   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4825      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4826      inner-loop nested in LOOP (during outer-loop vectorization).  */
4827
4828   /* No update in loop is required.  */
4829   if (only_init && (!loop_vinfo || at_loop == loop))
4830     aptr = aggr_ptr_init;
4831   else
4832     {
4833       if (iv_step == NULL_TREE)
4834         {
4835           /* The step of the aggregate pointer is the type size.  */
4836           iv_step = TYPE_SIZE_UNIT (aggr_type);
4837           /* One exception to the above is when the scalar step of the load in
4838              LOOP is zero. In this case the step here is also zero.  */
4839           if (*inv_p)
4840             iv_step = size_zero_node;
4841           else if (tree_int_cst_sgn (step) == -1)
4842             iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4843         }
4844
4845       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4846
4847       create_iv (aggr_ptr_init,
4848                  fold_convert (aggr_ptr_type, iv_step),
4849                  aggr_ptr, loop, &incr_gsi, insert_after,
4850                  &indx_before_incr, &indx_after_incr);
4851       incr = gsi_stmt (incr_gsi);
4852       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4853
4854       /* Copy the points-to information if it exists. */
4855       if (DR_PTR_INFO (dr))
4856         {
4857           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4858           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4859         }
4860       if (ptr_incr)
4861         *ptr_incr = incr;
4862
4863       aptr = indx_before_incr;
4864     }
4865
4866   if (!nested_in_vect_loop || only_init)
4867     return aptr;
4868
4869
4870   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4871      nested in LOOP, if exists.  */
4872
4873   gcc_assert (nested_in_vect_loop);
4874   if (!only_init)
4875     {
4876       standard_iv_increment_position (containing_loop, &incr_gsi,
4877                                       &insert_after);
4878       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4879                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4880                  &indx_after_incr);
4881       incr = gsi_stmt (incr_gsi);
4882       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4883
4884       /* Copy the points-to information if it exists. */
4885       if (DR_PTR_INFO (dr))
4886         {
4887           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4888           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4889         }
4890       if (ptr_incr)
4891         *ptr_incr = incr;
4892
4893       return indx_before_incr;
4894     }
4895   else
4896     gcc_unreachable ();
4897 }
4898
4899
4900 /* Function bump_vector_ptr
4901
4902    Increment a pointer (to a vector type) by vector-size. If requested,
4903    i.e. if PTR-INCR is given, then also connect the new increment stmt
4904    to the existing def-use update-chain of the pointer, by modifying
4905    the PTR_INCR as illustrated below:
4906
4907    The pointer def-use update-chain before this function:
4908                         DATAREF_PTR = phi (p_0, p_2)
4909                         ....
4910         PTR_INCR:       p_2 = DATAREF_PTR + step
4911
4912    The pointer def-use update-chain after this function:
4913                         DATAREF_PTR = phi (p_0, p_2)
4914                         ....
4915                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4916                         ....
4917         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4918
4919    Input:
4920    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4921                  in the loop.
4922    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4923               the loop.  The increment amount across iterations is expected
4924               to be vector_size.
4925    BSI - location where the new update stmt is to be placed.
4926    STMT - the original scalar memory-access stmt that is being vectorized.
4927    BUMP - optional. The offset by which to bump the pointer. If not given,
4928           the offset is assumed to be vector_size.
4929
4930    Output: Return NEW_DATAREF_PTR as illustrated above.
4931
4932 */
4933
4934 tree
4935 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4936                  gimple *stmt, tree bump)
4937 {
4938   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4939   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4940   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4941   tree update = TYPE_SIZE_UNIT (vectype);
4942   gassign *incr_stmt;
4943   ssa_op_iter iter;
4944   use_operand_p use_p;
4945   tree new_dataref_ptr;
4946
4947   if (bump)
4948     update = bump;
4949
4950   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4951     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4952   else
4953     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4954   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4955                                    dataref_ptr, update);
4956   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4957
4958   /* Copy the points-to information if it exists. */
4959   if (DR_PTR_INFO (dr))
4960     {
4961       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4962       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4963     }
4964
4965   if (!ptr_incr)
4966     return new_dataref_ptr;
4967
4968   /* Update the vector-pointer's cross-iteration increment.  */
4969   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4970     {
4971       tree use = USE_FROM_PTR (use_p);
4972
4973       if (use == dataref_ptr)
4974         SET_USE (use_p, new_dataref_ptr);
4975       else
4976         gcc_assert (operand_equal_p (use, update, 0));
4977     }
4978
4979   return new_dataref_ptr;
4980 }
4981
4982
4983 /* Copy memory reference info such as base/clique from the SRC reference
4984    to the DEST MEM_REF.  */
4985
4986 void
4987 vect_copy_ref_info (tree dest, tree src)
4988 {
4989   if (TREE_CODE (dest) != MEM_REF)
4990     return;
4991
4992   tree src_base = src;
4993   while (handled_component_p (src_base))
4994     src_base = TREE_OPERAND (src_base, 0);
4995   if (TREE_CODE (src_base) != MEM_REF
4996       && TREE_CODE (src_base) != TARGET_MEM_REF)
4997     return;
4998
4999   MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5000   MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5001 }
5002
5003
5004 /* Function vect_create_destination_var.
5005
5006    Create a new temporary of type VECTYPE.  */
5007
5008 tree
5009 vect_create_destination_var (tree scalar_dest, tree vectype)
5010 {
5011   tree vec_dest;
5012   const char *name;
5013   char *new_name;
5014   tree type;
5015   enum vect_var_kind kind;
5016
5017   kind = vectype
5018     ? VECTOR_BOOLEAN_TYPE_P (vectype)
5019     ? vect_mask_var
5020     : vect_simple_var
5021     : vect_scalar_var;
5022   type = vectype ? vectype : TREE_TYPE (scalar_dest);
5023
5024   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5025
5026   name = get_name (scalar_dest);
5027   if (name)
5028     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5029   else
5030     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5031   vec_dest = vect_get_new_vect_var (type, kind, new_name);
5032   free (new_name);
5033
5034   return vec_dest;
5035 }
5036
5037 /* Function vect_grouped_store_supported.
5038
5039    Returns TRUE if interleave high and interleave low permutations
5040    are supported, and FALSE otherwise.  */
5041
5042 bool
5043 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5044 {
5045   machine_mode mode = TYPE_MODE (vectype);
5046
5047   /* vect_permute_store_chain requires the group size to be equal to 3 or
5048      be a power of two.  */
5049   if (count != 3 && exact_log2 (count) == -1)
5050     {
5051       if (dump_enabled_p ())
5052         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5053                          "the size of the group of accesses"
5054                          " is not a power of 2 or not eqaul to 3\n");
5055       return false;
5056     }
5057
5058   /* Check that the permutation is supported.  */
5059   if (VECTOR_MODE_P (mode))
5060     {
5061       unsigned int i;
5062       if (count == 3)
5063         {
5064           unsigned int j0 = 0, j1 = 0, j2 = 0;
5065           unsigned int i, j;
5066
5067           unsigned int nelt;
5068           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5069             {
5070               if (dump_enabled_p ())
5071                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5072                                  "cannot handle groups of 3 stores for"
5073                                  " variable-length vectors\n");
5074               return false;
5075             }
5076
5077           vec_perm_builder sel (nelt, nelt, 1);
5078           sel.quick_grow (nelt);
5079           vec_perm_indices indices;
5080           for (j = 0; j < 3; j++)
5081             {
5082               int nelt0 = ((3 - j) * nelt) % 3;
5083               int nelt1 = ((3 - j) * nelt + 1) % 3;
5084               int nelt2 = ((3 - j) * nelt + 2) % 3;
5085               for (i = 0; i < nelt; i++)
5086                 {
5087                   if (3 * i + nelt0 < nelt)
5088                     sel[3 * i + nelt0] = j0++;
5089                   if (3 * i + nelt1 < nelt)
5090                     sel[3 * i + nelt1] = nelt + j1++;
5091                   if (3 * i + nelt2 < nelt)
5092                     sel[3 * i + nelt2] = 0;
5093                 }
5094               indices.new_vector (sel, 2, nelt);
5095               if (!can_vec_perm_const_p (mode, indices))
5096                 {
5097                   if (dump_enabled_p ())
5098                     dump_printf (MSG_MISSED_OPTIMIZATION,
5099                                  "permutation op not supported by target.\n");
5100                   return false;
5101                 }
5102
5103               for (i = 0; i < nelt; i++)
5104                 {
5105                   if (3 * i + nelt0 < nelt)
5106                     sel[3 * i + nelt0] = 3 * i + nelt0;
5107                   if (3 * i + nelt1 < nelt)
5108                     sel[3 * i + nelt1] = 3 * i + nelt1;
5109                   if (3 * i + nelt2 < nelt)
5110                     sel[3 * i + nelt2] = nelt + j2++;
5111                 }
5112               indices.new_vector (sel, 2, nelt);
5113               if (!can_vec_perm_const_p (mode, indices))
5114                 {
5115                   if (dump_enabled_p ())
5116                     dump_printf (MSG_MISSED_OPTIMIZATION,
5117                                  "permutation op not supported by target.\n");
5118                   return false;
5119                 }
5120             }
5121           return true;
5122         }
5123       else
5124         {
5125           /* If length is not equal to 3 then only power of 2 is supported.  */
5126           gcc_assert (pow2p_hwi (count));
5127           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5128
5129           /* The encoding has 2 interleaved stepped patterns.  */
5130           vec_perm_builder sel (nelt, 2, 3);
5131           sel.quick_grow (6);
5132           for (i = 0; i < 3; i++)
5133             {
5134               sel[i * 2] = i;
5135               sel[i * 2 + 1] = i + nelt;
5136             }
5137           vec_perm_indices indices (sel, 2, nelt);
5138           if (can_vec_perm_const_p (mode, indices))
5139             {
5140               for (i = 0; i < 6; i++)
5141                 sel[i] += exact_div (nelt, 2);
5142               indices.new_vector (sel, 2, nelt);
5143               if (can_vec_perm_const_p (mode, indices))
5144                 return true;
5145             }
5146         }
5147     }
5148
5149   if (dump_enabled_p ())
5150     dump_printf (MSG_MISSED_OPTIMIZATION,
5151                  "permutaion op not supported by target.\n");
5152   return false;
5153 }
5154
5155
5156 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5157    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5158
5159 bool
5160 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5161                             bool masked_p)
5162 {
5163   if (masked_p)
5164     return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5165                                          vec_mask_store_lanes_optab,
5166                                          vectype, count);
5167   else
5168     return vect_lanes_optab_supported_p ("vec_store_lanes",
5169                                          vec_store_lanes_optab,
5170                                          vectype, count);
5171 }
5172
5173
5174 /* Function vect_permute_store_chain.
5175
5176    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5177    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5178    the data correctly for the stores.  Return the final references for stores
5179    in RESULT_CHAIN.
5180
5181    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5182    The input is 4 vectors each containing 8 elements.  We assign a number to
5183    each element, the input sequence is:
5184
5185    1st vec:   0  1  2  3  4  5  6  7
5186    2nd vec:   8  9 10 11 12 13 14 15
5187    3rd vec:  16 17 18 19 20 21 22 23
5188    4th vec:  24 25 26 27 28 29 30 31
5189
5190    The output sequence should be:
5191
5192    1st vec:  0  8 16 24  1  9 17 25
5193    2nd vec:  2 10 18 26  3 11 19 27
5194    3rd vec:  4 12 20 28  5 13 21 30
5195    4th vec:  6 14 22 30  7 15 23 31
5196
5197    i.e., we interleave the contents of the four vectors in their order.
5198
5199    We use interleave_high/low instructions to create such output.  The input of
5200    each interleave_high/low operation is two vectors:
5201    1st vec    2nd vec
5202    0 1 2 3    4 5 6 7
5203    the even elements of the result vector are obtained left-to-right from the
5204    high/low elements of the first vector.  The odd elements of the result are
5205    obtained left-to-right from the high/low elements of the second vector.
5206    The output of interleave_high will be:   0 4 1 5
5207    and of interleave_low:                   2 6 3 7
5208
5209
5210    The permutation is done in log LENGTH stages.  In each stage interleave_high
5211    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5212    where the first argument is taken from the first half of DR_CHAIN and the
5213    second argument from it's second half.
5214    In our example,
5215
5216    I1: interleave_high (1st vec, 3rd vec)
5217    I2: interleave_low (1st vec, 3rd vec)
5218    I3: interleave_high (2nd vec, 4th vec)
5219    I4: interleave_low (2nd vec, 4th vec)
5220
5221    The output for the first stage is:
5222
5223    I1:  0 16  1 17  2 18  3 19
5224    I2:  4 20  5 21  6 22  7 23
5225    I3:  8 24  9 25 10 26 11 27
5226    I4: 12 28 13 29 14 30 15 31
5227
5228    The output of the second stage, i.e. the final result is:
5229
5230    I1:  0  8 16 24  1  9 17 25
5231    I2:  2 10 18 26  3 11 19 27
5232    I3:  4 12 20 28  5 13 21 30
5233    I4:  6 14 22 30  7 15 23 31.  */
5234
5235 void
5236 vect_permute_store_chain (vec<tree> dr_chain,
5237                           unsigned int length,
5238                           gimple *stmt,
5239                           gimple_stmt_iterator *gsi,
5240                           vec<tree> *result_chain)
5241 {
5242   tree vect1, vect2, high, low;
5243   gimple *perm_stmt;
5244   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5245   tree perm_mask_low, perm_mask_high;
5246   tree data_ref;
5247   tree perm3_mask_low, perm3_mask_high;
5248   unsigned int i, j, n, log_length = exact_log2 (length);
5249
5250   result_chain->quick_grow (length);
5251   memcpy (result_chain->address (), dr_chain.address (),
5252           length * sizeof (tree));
5253
5254   if (length == 3)
5255     {
5256       /* vect_grouped_store_supported ensures that this is constant.  */
5257       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5258       unsigned int j0 = 0, j1 = 0, j2 = 0;
5259
5260       vec_perm_builder sel (nelt, nelt, 1);
5261       sel.quick_grow (nelt);
5262       vec_perm_indices indices;
5263       for (j = 0; j < 3; j++)
5264         {
5265           int nelt0 = ((3 - j) * nelt) % 3;
5266           int nelt1 = ((3 - j) * nelt + 1) % 3;
5267           int nelt2 = ((3 - j) * nelt + 2) % 3;
5268
5269           for (i = 0; i < nelt; i++)
5270             {
5271               if (3 * i + nelt0 < nelt)
5272                 sel[3 * i + nelt0] = j0++;
5273               if (3 * i + nelt1 < nelt)
5274                 sel[3 * i + nelt1] = nelt + j1++;
5275               if (3 * i + nelt2 < nelt)
5276                 sel[3 * i + nelt2] = 0;
5277             }
5278           indices.new_vector (sel, 2, nelt);
5279           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5280
5281           for (i = 0; i < nelt; i++)
5282             {
5283               if (3 * i + nelt0 < nelt)
5284                 sel[3 * i + nelt0] = 3 * i + nelt0;
5285               if (3 * i + nelt1 < nelt)
5286                 sel[3 * i + nelt1] = 3 * i + nelt1;
5287               if (3 * i + nelt2 < nelt)
5288                 sel[3 * i + nelt2] = nelt + j2++;
5289             }
5290           indices.new_vector (sel, 2, nelt);
5291           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5292
5293           vect1 = dr_chain[0];
5294           vect2 = dr_chain[1];
5295
5296           /* Create interleaving stmt:
5297              low = VEC_PERM_EXPR <vect1, vect2,
5298                                   {j, nelt, *, j + 1, nelt + j + 1, *,
5299                                    j + 2, nelt + j + 2, *, ...}>  */
5300           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5301           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5302                                            vect2, perm3_mask_low);
5303           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5304
5305           vect1 = data_ref;
5306           vect2 = dr_chain[2];
5307           /* Create interleaving stmt:
5308              low = VEC_PERM_EXPR <vect1, vect2,
5309                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
5310                                    6, 7, nelt + j + 2, ...}>  */
5311           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5312           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5313                                            vect2, perm3_mask_high);
5314           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5315           (*result_chain)[j] = data_ref;
5316         }
5317     }
5318   else
5319     {
5320       /* If length is not equal to 3 then only power of 2 is supported.  */
5321       gcc_assert (pow2p_hwi (length));
5322
5323       /* The encoding has 2 interleaved stepped patterns.  */
5324       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5325       vec_perm_builder sel (nelt, 2, 3);
5326       sel.quick_grow (6);
5327       for (i = 0; i < 3; i++)
5328         {
5329           sel[i * 2] = i;
5330           sel[i * 2 + 1] = i + nelt;
5331         }
5332         vec_perm_indices indices (sel, 2, nelt);
5333         perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5334
5335         for (i = 0; i < 6; i++)
5336           sel[i] += exact_div (nelt, 2);
5337         indices.new_vector (sel, 2, nelt);
5338         perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5339
5340         for (i = 0, n = log_length; i < n; i++)
5341           {
5342             for (j = 0; j < length/2; j++)
5343               {
5344                 vect1 = dr_chain[j];
5345                 vect2 = dr_chain[j+length/2];
5346
5347                 /* Create interleaving stmt:
5348                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5349                                                         ...}>  */
5350                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5351                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5352                                                  vect2, perm_mask_high);
5353                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5354                 (*result_chain)[2*j] = high;
5355
5356                 /* Create interleaving stmt:
5357                    low = VEC_PERM_EXPR <vect1, vect2,
5358                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5359                                          ...}>  */
5360                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5361                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5362                                                  vect2, perm_mask_low);
5363                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5364                 (*result_chain)[2*j+1] = low;
5365               }
5366             memcpy (dr_chain.address (), result_chain->address (),
5367                     length * sizeof (tree));
5368           }
5369     }
5370 }
5371
5372 /* Function vect_setup_realignment
5373
5374    This function is called when vectorizing an unaligned load using
5375    the dr_explicit_realign[_optimized] scheme.
5376    This function generates the following code at the loop prolog:
5377
5378       p = initial_addr;
5379    x  msq_init = *(floor(p));   # prolog load
5380       realignment_token = call target_builtin;
5381     loop:
5382    x  msq = phi (msq_init, ---)
5383
5384    The stmts marked with x are generated only for the case of
5385    dr_explicit_realign_optimized.
5386
5387    The code above sets up a new (vector) pointer, pointing to the first
5388    location accessed by STMT, and a "floor-aligned" load using that pointer.
5389    It also generates code to compute the "realignment-token" (if the relevant
5390    target hook was defined), and creates a phi-node at the loop-header bb
5391    whose arguments are the result of the prolog-load (created by this
5392    function) and the result of a load that takes place in the loop (to be
5393    created by the caller to this function).
5394
5395    For the case of dr_explicit_realign_optimized:
5396    The caller to this function uses the phi-result (msq) to create the
5397    realignment code inside the loop, and sets up the missing phi argument,
5398    as follows:
5399     loop:
5400       msq = phi (msq_init, lsq)
5401       lsq = *(floor(p'));        # load in loop
5402       result = realign_load (msq, lsq, realignment_token);
5403
5404    For the case of dr_explicit_realign:
5405     loop:
5406       msq = *(floor(p));        # load in loop
5407       p' = p + (VS-1);
5408       lsq = *(floor(p'));       # load in loop
5409       result = realign_load (msq, lsq, realignment_token);
5410
5411    Input:
5412    STMT - (scalar) load stmt to be vectorized. This load accesses
5413           a memory location that may be unaligned.
5414    BSI - place where new code is to be inserted.
5415    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5416                               is used.
5417
5418    Output:
5419    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5420                        target hook, if defined.
5421    Return value - the result of the loop-header phi node.  */
5422
5423 tree
5424 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5425                         tree *realignment_token,
5426                         enum dr_alignment_support alignment_support_scheme,
5427                         tree init_addr,
5428                         struct loop **at_loop)
5429 {
5430   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5431   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5432   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5433   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5434   struct loop *loop = NULL;
5435   edge pe = NULL;
5436   tree scalar_dest = gimple_assign_lhs (stmt);
5437   tree vec_dest;
5438   gimple *inc;
5439   tree ptr;
5440   tree data_ref;
5441   basic_block new_bb;
5442   tree msq_init = NULL_TREE;
5443   tree new_temp;
5444   gphi *phi_stmt;
5445   tree msq = NULL_TREE;
5446   gimple_seq stmts = NULL;
5447   bool inv_p;
5448   bool compute_in_loop = false;
5449   bool nested_in_vect_loop = false;
5450   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5451   struct loop *loop_for_initial_load = NULL;
5452
5453   if (loop_vinfo)
5454     {
5455       loop = LOOP_VINFO_LOOP (loop_vinfo);
5456       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5457     }
5458
5459   gcc_assert (alignment_support_scheme == dr_explicit_realign
5460               || alignment_support_scheme == dr_explicit_realign_optimized);
5461
5462   /* We need to generate three things:
5463      1. the misalignment computation
5464      2. the extra vector load (for the optimized realignment scheme).
5465      3. the phi node for the two vectors from which the realignment is
5466       done (for the optimized realignment scheme).  */
5467
5468   /* 1. Determine where to generate the misalignment computation.
5469
5470      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5471      calculation will be generated by this function, outside the loop (in the
5472      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5473      caller, inside the loop.
5474
5475      Background: If the misalignment remains fixed throughout the iterations of
5476      the loop, then both realignment schemes are applicable, and also the
5477      misalignment computation can be done outside LOOP.  This is because we are
5478      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5479      are a multiple of VS (the Vector Size), and therefore the misalignment in
5480      different vectorized LOOP iterations is always the same.
5481      The problem arises only if the memory access is in an inner-loop nested
5482      inside LOOP, which is now being vectorized using outer-loop vectorization.
5483      This is the only case when the misalignment of the memory access may not
5484      remain fixed throughout the iterations of the inner-loop (as explained in
5485      detail in vect_supportable_dr_alignment).  In this case, not only is the
5486      optimized realignment scheme not applicable, but also the misalignment
5487      computation (and generation of the realignment token that is passed to
5488      REALIGN_LOAD) have to be done inside the loop.
5489
5490      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5491      or not, which in turn determines if the misalignment is computed inside
5492      the inner-loop, or outside LOOP.  */
5493
5494   if (init_addr != NULL_TREE || !loop_vinfo)
5495     {
5496       compute_in_loop = true;
5497       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5498     }
5499
5500
5501   /* 2. Determine where to generate the extra vector load.
5502
5503      For the optimized realignment scheme, instead of generating two vector
5504      loads in each iteration, we generate a single extra vector load in the
5505      preheader of the loop, and in each iteration reuse the result of the
5506      vector load from the previous iteration.  In case the memory access is in
5507      an inner-loop nested inside LOOP, which is now being vectorized using
5508      outer-loop vectorization, we need to determine whether this initial vector
5509      load should be generated at the preheader of the inner-loop, or can be
5510      generated at the preheader of LOOP.  If the memory access has no evolution
5511      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5512      to be generated inside LOOP (in the preheader of the inner-loop).  */
5513
5514   if (nested_in_vect_loop)
5515     {
5516       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5517       bool invariant_in_outerloop =
5518             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5519       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5520     }
5521   else
5522     loop_for_initial_load = loop;
5523   if (at_loop)
5524     *at_loop = loop_for_initial_load;
5525
5526   if (loop_for_initial_load)
5527     pe = loop_preheader_edge (loop_for_initial_load);
5528
5529   /* 3. For the case of the optimized realignment, create the first vector
5530       load at the loop preheader.  */
5531
5532   if (alignment_support_scheme == dr_explicit_realign_optimized)
5533     {
5534       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5535       gassign *new_stmt;
5536
5537       gcc_assert (!compute_in_loop);
5538       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5539       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5540                                       NULL_TREE, &init_addr, NULL, &inc,
5541                                       true, &inv_p);
5542       if (TREE_CODE (ptr) == SSA_NAME)
5543         new_temp = copy_ssa_name (ptr);
5544       else
5545         new_temp = make_ssa_name (TREE_TYPE (ptr));
5546       unsigned int align = DR_TARGET_ALIGNMENT (dr);
5547       new_stmt = gimple_build_assign
5548                    (new_temp, BIT_AND_EXPR, ptr,
5549                     build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5550       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5551       gcc_assert (!new_bb);
5552       data_ref
5553         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5554                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5555       vect_copy_ref_info (data_ref, DR_REF (dr));
5556       new_stmt = gimple_build_assign (vec_dest, data_ref);
5557       new_temp = make_ssa_name (vec_dest, new_stmt);
5558       gimple_assign_set_lhs (new_stmt, new_temp);
5559       if (pe)
5560         {
5561           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5562           gcc_assert (!new_bb);
5563         }
5564       else
5565          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5566
5567       msq_init = gimple_assign_lhs (new_stmt);
5568     }
5569
5570   /* 4. Create realignment token using a target builtin, if available.
5571       It is done either inside the containing loop, or before LOOP (as
5572       determined above).  */
5573
5574   if (targetm.vectorize.builtin_mask_for_load)
5575     {
5576       gcall *new_stmt;
5577       tree builtin_decl;
5578
5579       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
5580       if (!init_addr)
5581         {
5582           /* Generate the INIT_ADDR computation outside LOOP.  */
5583           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5584                                                             NULL_TREE);
5585           if (loop)
5586             {
5587               pe = loop_preheader_edge (loop);
5588               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5589               gcc_assert (!new_bb);
5590             }
5591           else
5592              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5593         }
5594
5595       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5596       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5597       vec_dest =
5598         vect_create_destination_var (scalar_dest,
5599                                      gimple_call_return_type (new_stmt));
5600       new_temp = make_ssa_name (vec_dest, new_stmt);
5601       gimple_call_set_lhs (new_stmt, new_temp);
5602
5603       if (compute_in_loop)
5604         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5605       else
5606         {
5607           /* Generate the misalignment computation outside LOOP.  */
5608           pe = loop_preheader_edge (loop);
5609           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5610           gcc_assert (!new_bb);
5611         }
5612
5613       *realignment_token = gimple_call_lhs (new_stmt);
5614
5615       /* The result of the CALL_EXPR to this builtin is determined from
5616          the value of the parameter and no global variables are touched
5617          which makes the builtin a "const" function.  Requiring the
5618          builtin to have the "const" attribute makes it unnecessary
5619          to call mark_call_clobbered.  */
5620       gcc_assert (TREE_READONLY (builtin_decl));
5621     }
5622
5623   if (alignment_support_scheme == dr_explicit_realign)
5624     return msq;
5625
5626   gcc_assert (!compute_in_loop);
5627   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5628
5629
5630   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5631
5632   pe = loop_preheader_edge (containing_loop);
5633   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5634   msq = make_ssa_name (vec_dest);
5635   phi_stmt = create_phi_node (msq, containing_loop->header);
5636   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5637
5638   return msq;
5639 }
5640
5641
5642 /* Function vect_grouped_load_supported.
5643
5644    COUNT is the size of the load group (the number of statements plus the
5645    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5646    only one statement, with a gap of COUNT - 1.
5647
5648    Returns true if a suitable permute exists.  */
5649
5650 bool
5651 vect_grouped_load_supported (tree vectype, bool single_element_p,
5652                              unsigned HOST_WIDE_INT count)
5653 {
5654   machine_mode mode = TYPE_MODE (vectype);
5655
5656   /* If this is single-element interleaving with an element distance
5657      that leaves unused vector loads around punt - we at least create
5658      very sub-optimal code in that case (and blow up memory,
5659      see PR65518).  */
5660   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5661     {
5662       if (dump_enabled_p ())
5663         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5664                          "single-element interleaving not supported "
5665                          "for not adjacent vector loads\n");
5666       return false;
5667     }
5668
5669   /* vect_permute_load_chain requires the group size to be equal to 3 or
5670      be a power of two.  */
5671   if (count != 3 && exact_log2 (count) == -1)
5672     {
5673       if (dump_enabled_p ())
5674         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5675                          "the size of the group of accesses"
5676                          " is not a power of 2 or not equal to 3\n");
5677       return false;
5678     }
5679
5680   /* Check that the permutation is supported.  */
5681   if (VECTOR_MODE_P (mode))
5682     {
5683       unsigned int i, j;
5684       if (count == 3)
5685         {
5686           unsigned int nelt;
5687           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5688             {
5689               if (dump_enabled_p ())
5690                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5691                                  "cannot handle groups of 3 loads for"
5692                                  " variable-length vectors\n");
5693               return false;
5694             }
5695
5696           vec_perm_builder sel (nelt, nelt, 1);
5697           sel.quick_grow (nelt);
5698           vec_perm_indices indices;
5699           unsigned int k;
5700           for (k = 0; k < 3; k++)
5701             {
5702               for (i = 0; i < nelt; i++)
5703                 if (3 * i + k < 2 * nelt)
5704                   sel[i] = 3 * i + k;
5705                 else
5706                   sel[i] = 0;
5707               indices.new_vector (sel, 2, nelt);
5708               if (!can_vec_perm_const_p (mode, indices))
5709                 {
5710                   if (dump_enabled_p ())
5711                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5712                                      "shuffle of 3 loads is not supported by"
5713                                      " target\n");
5714                   return false;
5715                 }
5716               for (i = 0, j = 0; i < nelt; i++)
5717                 if (3 * i + k < 2 * nelt)
5718                   sel[i] = i;
5719                 else
5720                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5721               indices.new_vector (sel, 2, nelt);
5722               if (!can_vec_perm_const_p (mode, indices))
5723                 {
5724                   if (dump_enabled_p ())
5725                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5726                                      "shuffle of 3 loads is not supported by"
5727                                      " target\n");
5728                   return false;
5729                 }
5730             }
5731           return true;
5732         }
5733       else
5734         {
5735           /* If length is not equal to 3 then only power of 2 is supported.  */
5736           gcc_assert (pow2p_hwi (count));
5737           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5738
5739           /* The encoding has a single stepped pattern.  */
5740           vec_perm_builder sel (nelt, 1, 3);
5741           sel.quick_grow (3);
5742           for (i = 0; i < 3; i++)
5743             sel[i] = i * 2;
5744           vec_perm_indices indices (sel, 2, nelt);
5745           if (can_vec_perm_const_p (mode, indices))
5746             {
5747               for (i = 0; i < 3; i++)
5748                 sel[i] = i * 2 + 1;
5749               indices.new_vector (sel, 2, nelt);
5750               if (can_vec_perm_const_p (mode, indices))
5751                 return true;
5752             }
5753         }
5754     }
5755
5756   if (dump_enabled_p ())
5757     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5758                      "extract even/odd not supported by target\n");
5759   return false;
5760 }
5761
5762 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5763    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5764
5765 bool
5766 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5767                            bool masked_p)
5768 {
5769   if (masked_p)
5770     return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5771                                          vec_mask_load_lanes_optab,
5772                                          vectype, count);
5773   else
5774     return vect_lanes_optab_supported_p ("vec_load_lanes",
5775                                          vec_load_lanes_optab,
5776                                          vectype, count);
5777 }
5778
5779 /* Function vect_permute_load_chain.
5780
5781    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5782    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5783    the input data correctly.  Return the final references for loads in
5784    RESULT_CHAIN.
5785
5786    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5787    The input is 4 vectors each containing 8 elements. We assign a number to each
5788    element, the input sequence is:
5789
5790    1st vec:   0  1  2  3  4  5  6  7
5791    2nd vec:   8  9 10 11 12 13 14 15
5792    3rd vec:  16 17 18 19 20 21 22 23
5793    4th vec:  24 25 26 27 28 29 30 31
5794
5795    The output sequence should be:
5796
5797    1st vec:  0 4  8 12 16 20 24 28
5798    2nd vec:  1 5  9 13 17 21 25 29
5799    3rd vec:  2 6 10 14 18 22 26 30
5800    4th vec:  3 7 11 15 19 23 27 31
5801
5802    i.e., the first output vector should contain the first elements of each
5803    interleaving group, etc.
5804
5805    We use extract_even/odd instructions to create such output.  The input of
5806    each extract_even/odd operation is two vectors
5807    1st vec    2nd vec
5808    0 1 2 3    4 5 6 7
5809
5810    and the output is the vector of extracted even/odd elements.  The output of
5811    extract_even will be:   0 2 4 6
5812    and of extract_odd:     1 3 5 7
5813
5814
5815    The permutation is done in log LENGTH stages.  In each stage extract_even
5816    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5817    their order.  In our example,
5818
5819    E1: extract_even (1st vec, 2nd vec)
5820    E2: extract_odd (1st vec, 2nd vec)
5821    E3: extract_even (3rd vec, 4th vec)
5822    E4: extract_odd (3rd vec, 4th vec)
5823
5824    The output for the first stage will be:
5825
5826    E1:  0  2  4  6  8 10 12 14
5827    E2:  1  3  5  7  9 11 13 15
5828    E3: 16 18 20 22 24 26 28 30
5829    E4: 17 19 21 23 25 27 29 31
5830
5831    In order to proceed and create the correct sequence for the next stage (or
5832    for the correct output, if the second stage is the last one, as in our
5833    example), we first put the output of extract_even operation and then the
5834    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5835    The input for the second stage is:
5836
5837    1st vec (E1):  0  2  4  6  8 10 12 14
5838    2nd vec (E3): 16 18 20 22 24 26 28 30
5839    3rd vec (E2):  1  3  5  7  9 11 13 15
5840    4th vec (E4): 17 19 21 23 25 27 29 31
5841
5842    The output of the second stage:
5843
5844    E1: 0 4  8 12 16 20 24 28
5845    E2: 2 6 10 14 18 22 26 30
5846    E3: 1 5  9 13 17 21 25 29
5847    E4: 3 7 11 15 19 23 27 31
5848
5849    And RESULT_CHAIN after reordering:
5850
5851    1st vec (E1):  0 4  8 12 16 20 24 28
5852    2nd vec (E3):  1 5  9 13 17 21 25 29
5853    3rd vec (E2):  2 6 10 14 18 22 26 30
5854    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5855
5856 static void
5857 vect_permute_load_chain (vec<tree> dr_chain,
5858                          unsigned int length,
5859                          gimple *stmt,
5860                          gimple_stmt_iterator *gsi,
5861                          vec<tree> *result_chain)
5862 {
5863   tree data_ref, first_vect, second_vect;
5864   tree perm_mask_even, perm_mask_odd;
5865   tree perm3_mask_low, perm3_mask_high;
5866   gimple *perm_stmt;
5867   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5868   unsigned int i, j, log_length = exact_log2 (length);
5869
5870   result_chain->quick_grow (length);
5871   memcpy (result_chain->address (), dr_chain.address (),
5872           length * sizeof (tree));
5873
5874   if (length == 3)
5875     {
5876       /* vect_grouped_load_supported ensures that this is constant.  */
5877       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5878       unsigned int k;
5879
5880       vec_perm_builder sel (nelt, nelt, 1);
5881       sel.quick_grow (nelt);
5882       vec_perm_indices indices;
5883       for (k = 0; k < 3; k++)
5884         {
5885           for (i = 0; i < nelt; i++)
5886             if (3 * i + k < 2 * nelt)
5887               sel[i] = 3 * i + k;
5888             else
5889               sel[i] = 0;
5890           indices.new_vector (sel, 2, nelt);
5891           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5892
5893           for (i = 0, j = 0; i < nelt; i++)
5894             if (3 * i + k < 2 * nelt)
5895               sel[i] = i;
5896             else
5897               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5898           indices.new_vector (sel, 2, nelt);
5899           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5900
5901           first_vect = dr_chain[0];
5902           second_vect = dr_chain[1];
5903
5904           /* Create interleaving stmt (low part of):
5905              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5906                                                              ...}>  */
5907           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5908           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5909                                            second_vect, perm3_mask_low);
5910           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5911
5912           /* Create interleaving stmt (high part of):
5913              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5914                                                               ...}>  */
5915           first_vect = data_ref;
5916           second_vect = dr_chain[2];
5917           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5918           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5919                                            second_vect, perm3_mask_high);
5920           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5921           (*result_chain)[k] = data_ref;
5922         }
5923     }
5924   else
5925     {
5926       /* If length is not equal to 3 then only power of 2 is supported.  */
5927       gcc_assert (pow2p_hwi (length));
5928
5929       /* The encoding has a single stepped pattern.  */
5930       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5931       vec_perm_builder sel (nelt, 1, 3);
5932       sel.quick_grow (3);
5933       for (i = 0; i < 3; ++i)
5934         sel[i] = i * 2;
5935       vec_perm_indices indices (sel, 2, nelt);
5936       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5937
5938       for (i = 0; i < 3; ++i)
5939         sel[i] = i * 2 + 1;
5940       indices.new_vector (sel, 2, nelt);
5941       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5942
5943       for (i = 0; i < log_length; i++)
5944         {
5945           for (j = 0; j < length; j += 2)
5946             {
5947               first_vect = dr_chain[j];
5948               second_vect = dr_chain[j+1];
5949
5950               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5951               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5952               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5953                                                first_vect, second_vect,
5954                                                perm_mask_even);
5955               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5956               (*result_chain)[j/2] = data_ref;
5957
5958               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5959               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5960               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5961                                                first_vect, second_vect,
5962                                                perm_mask_odd);
5963               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5964               (*result_chain)[j/2+length/2] = data_ref;
5965             }
5966           memcpy (dr_chain.address (), result_chain->address (),
5967                   length * sizeof (tree));
5968         }
5969     }
5970 }
5971
5972 /* Function vect_shift_permute_load_chain.
5973
5974    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5975    sequence of stmts to reorder the input data accordingly.
5976    Return the final references for loads in RESULT_CHAIN.
5977    Return true if successed, false otherwise.
5978
5979    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5980    The input is 3 vectors each containing 8 elements.  We assign a
5981    number to each element, the input sequence is:
5982
5983    1st vec:   0  1  2  3  4  5  6  7
5984    2nd vec:   8  9 10 11 12 13 14 15
5985    3rd vec:  16 17 18 19 20 21 22 23
5986
5987    The output sequence should be:
5988
5989    1st vec:  0 3 6  9 12 15 18 21
5990    2nd vec:  1 4 7 10 13 16 19 22
5991    3rd vec:  2 5 8 11 14 17 20 23
5992
5993    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5994
5995    First we shuffle all 3 vectors to get correct elements order:
5996
5997    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5998    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5999    3rd vec:  (16 19 22) (17 20 23) (18 21)
6000
6001    Next we unite and shift vector 3 times:
6002
6003    1st step:
6004      shift right by 6 the concatenation of:
6005      "1st vec" and  "2nd vec"
6006        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6007      "2nd vec" and  "3rd vec"
6008        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6009      "3rd vec" and  "1st vec"
6010        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6011                              | New vectors                   |
6012
6013      So that now new vectors are:
6014
6015      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6016      2nd vec:  (10 13) (16 19 22) (17 20 23)
6017      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6018
6019    2nd step:
6020      shift right by 5 the concatenation of:
6021      "1st vec" and  "3rd vec"
6022        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6023      "2nd vec" and  "1st vec"
6024        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6025      "3rd vec" and  "2nd vec"
6026        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6027                           | New vectors                   |
6028
6029      So that now new vectors are:
6030
6031      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6032      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6033      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6034
6035    3rd step:
6036      shift right by 5 the concatenation of:
6037      "1st vec" and  "1st vec"
6038        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6039      shift right by 3 the concatenation of:
6040      "2nd vec" and  "2nd vec"
6041                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6042                           | New vectors                   |
6043
6044      So that now all vectors are READY:
6045      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6046      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6047      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6048
6049    This algorithm is faster than one in vect_permute_load_chain if:
6050      1.  "shift of a concatination" is faster than general permutation.
6051          This is usually so.
6052      2.  The TARGET machine can't execute vector instructions in parallel.
6053          This is because each step of the algorithm depends on previous.
6054          The algorithm in vect_permute_load_chain is much more parallel.
6055
6056    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6057 */
6058
6059 static bool
6060 vect_shift_permute_load_chain (vec<tree> dr_chain,
6061                                unsigned int length,
6062                                gimple *stmt,
6063                                gimple_stmt_iterator *gsi,
6064                                vec<tree> *result_chain)
6065 {
6066   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6067   tree perm2_mask1, perm2_mask2, perm3_mask;
6068   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6069   gimple *perm_stmt;
6070
6071   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
6072   unsigned int i;
6073   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6074   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6075
6076   unsigned HOST_WIDE_INT nelt, vf;
6077   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6078       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6079     /* Not supported for variable-length vectors.  */
6080     return false;
6081
6082   vec_perm_builder sel (nelt, nelt, 1);
6083   sel.quick_grow (nelt);
6084
6085   result_chain->quick_grow (length);
6086   memcpy (result_chain->address (), dr_chain.address (),
6087           length * sizeof (tree));
6088
6089   if (pow2p_hwi (length) && vf > 4)
6090     {
6091       unsigned int j, log_length = exact_log2 (length);
6092       for (i = 0; i < nelt / 2; ++i)
6093         sel[i] = i * 2;
6094       for (i = 0; i < nelt / 2; ++i)
6095         sel[nelt / 2 + i] = i * 2 + 1;
6096       vec_perm_indices indices (sel, 2, nelt);
6097       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6098         {
6099           if (dump_enabled_p ())
6100             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6101                              "shuffle of 2 fields structure is not \
6102                               supported by target\n");
6103           return false;
6104         }
6105       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6106
6107       for (i = 0; i < nelt / 2; ++i)
6108         sel[i] = i * 2 + 1;
6109       for (i = 0; i < nelt / 2; ++i)
6110         sel[nelt / 2 + i] = i * 2;
6111       indices.new_vector (sel, 2, nelt);
6112       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6113         {
6114           if (dump_enabled_p ())
6115             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6116                              "shuffle of 2 fields structure is not \
6117                               supported by target\n");
6118           return false;
6119         }
6120       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6121
6122       /* Generating permutation constant to shift all elements.
6123          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6124       for (i = 0; i < nelt; i++)
6125         sel[i] = nelt / 2 + i;
6126       indices.new_vector (sel, 2, nelt);
6127       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6128         {
6129           if (dump_enabled_p ())
6130             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6131                              "shift permutation is not supported by target\n");
6132           return false;
6133         }
6134       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6135
6136       /* Generating permutation constant to select vector from 2.
6137          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6138       for (i = 0; i < nelt / 2; i++)
6139         sel[i] = i;
6140       for (i = nelt / 2; i < nelt; i++)
6141         sel[i] = nelt + i;
6142       indices.new_vector (sel, 2, nelt);
6143       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6144         {
6145           if (dump_enabled_p ())
6146             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6147                              "select is not supported by target\n");
6148           return false;
6149         }
6150       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6151
6152       for (i = 0; i < log_length; i++)
6153         {
6154           for (j = 0; j < length; j += 2)
6155             {
6156               first_vect = dr_chain[j];
6157               second_vect = dr_chain[j + 1];
6158
6159               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6160               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6161                                                first_vect, first_vect,
6162                                                perm2_mask1);
6163               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6164               vect[0] = data_ref;
6165
6166               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6167               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6168                                                second_vect, second_vect,
6169                                                perm2_mask2);
6170               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6171               vect[1] = data_ref;
6172
6173               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6174               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6175                                                vect[0], vect[1], shift1_mask);
6176               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6177               (*result_chain)[j/2 + length/2] = data_ref;
6178
6179               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6180               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6181                                                vect[0], vect[1], select_mask);
6182               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6183               (*result_chain)[j/2] = data_ref;
6184             }
6185           memcpy (dr_chain.address (), result_chain->address (),
6186                   length * sizeof (tree));
6187         }
6188       return true;
6189     }
6190   if (length == 3 && vf > 2)
6191     {
6192       unsigned int k = 0, l = 0;
6193
6194       /* Generating permutation constant to get all elements in rigth order.
6195          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6196       for (i = 0; i < nelt; i++)
6197         {
6198           if (3 * k + (l % 3) >= nelt)
6199             {
6200               k = 0;
6201               l += (3 - (nelt % 3));
6202             }
6203           sel[i] = 3 * k + (l % 3);
6204           k++;
6205         }
6206       vec_perm_indices indices (sel, 2, nelt);
6207       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6208         {
6209           if (dump_enabled_p ())
6210             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6211                              "shuffle of 3 fields structure is not \
6212                               supported by target\n");
6213           return false;
6214         }
6215       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6216
6217       /* Generating permutation constant to shift all elements.
6218          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6219       for (i = 0; i < nelt; i++)
6220         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6221       indices.new_vector (sel, 2, nelt);
6222       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6223         {
6224           if (dump_enabled_p ())
6225             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6226                              "shift permutation is not supported by target\n");
6227           return false;
6228         }
6229       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6230
6231       /* Generating permutation constant to shift all elements.
6232          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6233       for (i = 0; i < nelt; i++)
6234         sel[i] = 2 * (nelt / 3) + 1 + i;
6235       indices.new_vector (sel, 2, nelt);
6236       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6237         {
6238           if (dump_enabled_p ())
6239             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6240                              "shift permutation is not supported by target\n");
6241           return false;
6242         }
6243       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6244
6245       /* Generating permutation constant to shift all elements.
6246          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6247       for (i = 0; i < nelt; i++)
6248         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6249       indices.new_vector (sel, 2, nelt);
6250       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6251         {
6252           if (dump_enabled_p ())
6253             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6254                              "shift permutation is not supported by target\n");
6255           return false;
6256         }
6257       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6258
6259       /* Generating permutation constant to shift all elements.
6260          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6261       for (i = 0; i < nelt; i++)
6262         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6263       indices.new_vector (sel, 2, nelt);
6264       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6265         {
6266           if (dump_enabled_p ())
6267             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6268                              "shift permutation is not supported by target\n");
6269           return false;
6270         }
6271       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6272
6273       for (k = 0; k < 3; k++)
6274         {
6275           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6276           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6277                                            dr_chain[k], dr_chain[k],
6278                                            perm3_mask);
6279           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6280           vect[k] = data_ref;
6281         }
6282
6283       for (k = 0; k < 3; k++)
6284         {
6285           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6286           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6287                                            vect[k % 3], vect[(k + 1) % 3],
6288                                            shift1_mask);
6289           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6290           vect_shift[k] = data_ref;
6291         }
6292
6293       for (k = 0; k < 3; k++)
6294         {
6295           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6296           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6297                                            vect_shift[(4 - k) % 3],
6298                                            vect_shift[(3 - k) % 3],
6299                                            shift2_mask);
6300           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6301           vect[k] = data_ref;
6302         }
6303
6304       (*result_chain)[3 - (nelt % 3)] = vect[2];
6305
6306       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6307       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6308                                        vect[0], shift3_mask);
6309       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6310       (*result_chain)[nelt % 3] = data_ref;
6311
6312       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6313       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6314                                        vect[1], shift4_mask);
6315       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6316       (*result_chain)[0] = data_ref;
6317       return true;
6318     }
6319   return false;
6320 }
6321
6322 /* Function vect_transform_grouped_load.
6323
6324    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6325    to perform their permutation and ascribe the result vectorized statements to
6326    the scalar statements.
6327 */
6328
6329 void
6330 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6331                              gimple_stmt_iterator *gsi)
6332 {
6333   machine_mode mode;
6334   vec<tree> result_chain = vNULL;
6335
6336   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6337      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6338      vectors, that are ready for vector computation.  */
6339   result_chain.create (size);
6340
6341   /* If reassociation width for vector type is 2 or greater target machine can
6342      execute 2 or more vector instructions in parallel.  Otherwise try to
6343      get chain for loads group using vect_shift_permute_load_chain.  */
6344   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6345   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6346       || pow2p_hwi (size)
6347       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6348                                          gsi, &result_chain))
6349     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6350   vect_record_grouped_load_vectors (stmt, result_chain);
6351   result_chain.release ();
6352 }
6353
6354 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6355    generated as part of the vectorization of STMT.  Assign the statement
6356    for each vector to the associated scalar statement.  */
6357
6358 void
6359 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6360 {
6361   gimple *first_stmt = DR_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6362   gimple *next_stmt, *new_stmt;
6363   unsigned int i, gap_count;
6364   tree tmp_data_ref;
6365
6366   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6367      Since we scan the chain starting from it's first node, their order
6368      corresponds the order of data-refs in RESULT_CHAIN.  */
6369   next_stmt = first_stmt;
6370   gap_count = 1;
6371   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6372     {
6373       if (!next_stmt)
6374         break;
6375
6376       /* Skip the gaps.  Loads created for the gaps will be removed by dead
6377        code elimination pass later.  No need to check for the first stmt in
6378        the group, since it always exists.
6379        DR_GROUP_GAP is the number of steps in elements from the previous
6380        access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
6381        correspond to the gaps.  */
6382       if (next_stmt != first_stmt
6383           && gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt)))
6384       {
6385         gap_count++;
6386         continue;
6387       }
6388
6389       while (next_stmt)
6390         {
6391           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6392           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6393              copies, and we put the new vector statement in the first available
6394              RELATED_STMT.  */
6395           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6396             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6397           else
6398             {
6399               if (!DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6400                 {
6401                   gimple *prev_stmt =
6402                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6403                   gimple *rel_stmt =
6404                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6405                   while (rel_stmt)
6406                     {
6407                       prev_stmt = rel_stmt;
6408                       rel_stmt =
6409                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6410                     }
6411
6412                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6413                     new_stmt;
6414                 }
6415             }
6416
6417           next_stmt = DR_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6418           gap_count = 1;
6419           /* If NEXT_STMT accesses the same DR as the previous statement,
6420              put the same TMP_DATA_REF as its vectorized statement; otherwise
6421              get the next data-ref from RESULT_CHAIN.  */
6422           if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6423             break;
6424         }
6425     }
6426 }
6427
6428 /* Function vect_force_dr_alignment_p.
6429
6430    Returns whether the alignment of a DECL can be forced to be aligned
6431    on ALIGNMENT bit boundary.  */
6432
6433 bool
6434 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6435 {
6436   if (!VAR_P (decl))
6437     return false;
6438
6439   if (decl_in_symtab_p (decl)
6440       && !symtab_node::get (decl)->can_increase_alignment_p ())
6441     return false;
6442
6443   if (TREE_STATIC (decl))
6444     return (alignment <= MAX_OFILE_ALIGNMENT);
6445   else
6446     return (alignment <= MAX_STACK_ALIGNMENT);
6447 }
6448
6449
6450 /* Return whether the data reference DR is supported with respect to its
6451    alignment.
6452    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6453    it is aligned, i.e., check if it is possible to vectorize it with different
6454    alignment.  */
6455
6456 enum dr_alignment_support
6457 vect_supportable_dr_alignment (struct data_reference *dr,
6458                                bool check_aligned_accesses)
6459 {
6460   gimple *stmt = vect_dr_stmt (dr);
6461   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6462   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6463   machine_mode mode = TYPE_MODE (vectype);
6464   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6465   struct loop *vect_loop = NULL;
6466   bool nested_in_vect_loop = false;
6467
6468   if (aligned_access_p (dr) && !check_aligned_accesses)
6469     return dr_aligned;
6470
6471   /* For now assume all conditional loads/stores support unaligned
6472      access without any special code.  */
6473   if (is_gimple_call (stmt)
6474       && gimple_call_internal_p (stmt)
6475       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6476           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6477     return dr_unaligned_supported;
6478
6479   if (loop_vinfo)
6480     {
6481       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6482       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6483     }
6484
6485   /* Possibly unaligned access.  */
6486
6487   /* We can choose between using the implicit realignment scheme (generating
6488      a misaligned_move stmt) and the explicit realignment scheme (generating
6489      aligned loads with a REALIGN_LOAD).  There are two variants to the
6490      explicit realignment scheme: optimized, and unoptimized.
6491      We can optimize the realignment only if the step between consecutive
6492      vector loads is equal to the vector size.  Since the vector memory
6493      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6494      is guaranteed that the misalignment amount remains the same throughout the
6495      execution of the vectorized loop.  Therefore, we can create the
6496      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6497      at the loop preheader.
6498
6499      However, in the case of outer-loop vectorization, when vectorizing a
6500      memory access in the inner-loop nested within the LOOP that is now being
6501      vectorized, while it is guaranteed that the misalignment of the
6502      vectorized memory access will remain the same in different outer-loop
6503      iterations, it is *not* guaranteed that is will remain the same throughout
6504      the execution of the inner-loop.  This is because the inner-loop advances
6505      with the original scalar step (and not in steps of VS).  If the inner-loop
6506      step happens to be a multiple of VS, then the misalignment remains fixed
6507      and we can use the optimized realignment scheme.  For example:
6508
6509       for (i=0; i<N; i++)
6510         for (j=0; j<M; j++)
6511           s += a[i+j];
6512
6513      When vectorizing the i-loop in the above example, the step between
6514      consecutive vector loads is 1, and so the misalignment does not remain
6515      fixed across the execution of the inner-loop, and the realignment cannot
6516      be optimized (as illustrated in the following pseudo vectorized loop):
6517
6518       for (i=0; i<N; i+=4)
6519         for (j=0; j<M; j++){
6520           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6521                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6522                          // (assuming that we start from an aligned address).
6523           }
6524
6525      We therefore have to use the unoptimized realignment scheme:
6526
6527       for (i=0; i<N; i+=4)
6528           for (j=k; j<M; j+=4)
6529           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6530                            // that the misalignment of the initial address is
6531                            // 0).
6532
6533      The loop can then be vectorized as follows:
6534
6535       for (k=0; k<4; k++){
6536         rt = get_realignment_token (&vp[k]);
6537         for (i=0; i<N; i+=4){
6538           v1 = vp[i+k];
6539           for (j=k; j<M; j+=4){
6540             v2 = vp[i+j+VS-1];
6541             va = REALIGN_LOAD <v1,v2,rt>;
6542             vs += va;
6543             v1 = v2;
6544           }
6545         }
6546     } */
6547
6548   if (DR_IS_READ (dr))
6549     {
6550       bool is_packed = false;
6551       tree type = (TREE_TYPE (DR_REF (dr)));
6552
6553       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6554           && (!targetm.vectorize.builtin_mask_for_load
6555               || targetm.vectorize.builtin_mask_for_load ()))
6556         {
6557           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6558
6559           /* If we are doing SLP then the accesses need not have the
6560              same alignment, instead it depends on the SLP group size.  */
6561           if (loop_vinfo
6562               && STMT_SLP_TYPE (stmt_info)
6563               && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6564                               * DR_GROUP_SIZE (vinfo_for_stmt
6565                                             (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6566                               TYPE_VECTOR_SUBPARTS (vectype)))
6567             ;
6568           else if (!loop_vinfo
6569                    || (nested_in_vect_loop
6570                        && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6571                                     GET_MODE_SIZE (TYPE_MODE (vectype)))))
6572             return dr_explicit_realign;
6573           else
6574             return dr_explicit_realign_optimized;
6575         }
6576       if (!known_alignment_for_access_p (dr))
6577         is_packed = not_size_aligned (DR_REF (dr));
6578
6579       if (targetm.vectorize.support_vector_misalignment
6580             (mode, type, DR_MISALIGNMENT (dr), is_packed))
6581         /* Can't software pipeline the loads, but can at least do them.  */
6582         return dr_unaligned_supported;
6583     }
6584   else
6585     {
6586       bool is_packed = false;
6587       tree type = (TREE_TYPE (DR_REF (dr)));
6588
6589       if (!known_alignment_for_access_p (dr))
6590         is_packed = not_size_aligned (DR_REF (dr));
6591
6592      if (targetm.vectorize.support_vector_misalignment
6593            (mode, type, DR_MISALIGNMENT (dr), is_packed))
6594        return dr_unaligned_supported;
6595     }
6596
6597   /* Unsupported.  */
6598   return dr_unaligned_unsupported;
6599 }