gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2017 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "params.h"
  53 #include "tree-cfg.h"
  54 #include "tree-hash-traits.h"
  55
  56 /* Return true if load- or store-lanes optab OPTAB is implemented for
  57    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  58
  59 static bool
  60 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  61                               tree vectype, unsigned HOST_WIDE_INT count)
  62 {
  63   machine_mode mode;
  64   scalar_int_mode array_mode;
  65   bool limit_p;
  66
  67   mode = TYPE_MODE (vectype);
  68   limit_p = !targetm.array_mode_supported_p (mode, count);
  69   if (!int_mode_for_size (count * GET_MODE_BITSIZE (mode),
  70                           limit_p).exists (&array_mode))
  71     {
  72       if (dump_enabled_p ())
  73         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  74                          "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
  75                          GET_MODE_NAME (mode), count);
  76       return false;
  77     }
  78
  79   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  80     {
  81       if (dump_enabled_p ())
  82         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  83                          "cannot use %s<%s><%s>\n", name,
  84                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  85       return false;
  86     }
  87
  88   if (dump_enabled_p ())
  89     dump_printf_loc (MSG_NOTE, vect_location,
  90                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  91                      GET_MODE_NAME (mode));
  92
  93   return true;
  94 }
  95
  96
  97 /* Return the smallest scalar part of STMT.
  98    This is used to determine the vectype of the stmt.  We generally set the
  99    vectype according to the type of the result (lhs).  For stmts whose
 100    result-type is different than the type of the arguments (e.g., demotion,
 101    promotion), vectype will be reset appropriately (later).  Note that we have
 102    to visit the smallest datatype in this function, because that determines the
 103    VF.  If the smallest datatype in the loop is present only as the rhs of a
 104    promotion operation - we'd miss it.
 105    Such a case, where a variable of this datatype does not appear in the lhs
 106    anywhere in the loop, can only occur if it's an invariant: e.g.:
 107    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 108    invariant motion.  However, we cannot rely on invariant motion to always
 109    take invariants out of the loop, and so in the case of promotion we also
 110    have to check the rhs.
 111    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 112    types.  */
 113
 114 tree
 115 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
 116                                HOST_WIDE_INT *rhs_size_unit)
 117 {
 118   tree scalar_type = gimple_expr_type (stmt);
 119   HOST_WIDE_INT lhs, rhs;
 120
 121   /* During the analysis phase, this function is called on arbitrary
 122      statements that might not have scalar results.  */
 123   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
 124     return scalar_type;
 125
 126   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 127
 128   if (is_gimple_assign (stmt)
 129       && (gimple_assign_cast_p (stmt)
 130           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
 131           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
 132           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
 133     {
 134       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 135
 136       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 137       if (rhs < lhs)
 138         scalar_type = rhs_type;
 139     }
 140
 141   *lhs_size_unit = lhs;
 142   *rhs_size_unit = rhs;
 143   return scalar_type;
 144 }
 145
 146
 147 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 148    tested at run-time.  Return TRUE if DDR was successfully inserted.
 149    Return false if versioning is not supported.  */
 150
 151 static bool
 152 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 153 {
 154   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 155
 156   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 157     return false;
 158
 159   if (!runtime_alias_check_p (ddr, loop,
 160                               optimize_loop_nest_for_speed_p (loop)))
 161     return false;
 162
 163   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 164   return true;
 165 }
 166
 167
 168 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 169    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 170    distances.  These distances are conservatively correct but they don't
 171    reflect a guaranteed dependence.
 172
 173    Return true if this function does all the work necessary to avoid
 174    an alias or false if the caller should use the dependence distances
 175    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 176    the depth of the loop described by LOOP_VINFO and the other arguments
 177    are as for vect_analyze_data_ref_dependence.  */
 178
 179 static bool
 180 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 181                                        loop_vec_info loop_vinfo,
 182                                        int loop_depth, int *max_vf)
 183 {
 184   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 185   lambda_vector dist_v;
 186   unsigned int i;
 187   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 188     {
 189       int dist = dist_v[loop_depth];
 190       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 191         {
 192           /* If the user asserted safelen >= DIST consecutive iterations
 193              can be executed concurrently, assume independence.
 194
 195              ??? An alternative would be to add the alias check even
 196              in this case, and vectorize the fallback loop with the
 197              maximum VF set to safelen.  However, if the user has
 198              explicitly given a length, it's less likely that that
 199              would be a win.  */
 200           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 201             {
 202               if (loop->safelen < *max_vf)
 203                 *max_vf = loop->safelen;
 204               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 205               continue;
 206             }
 207
 208           /* For dependence distances of 2 or more, we have the option
 209              of limiting VF or checking for an alias at runtime.
 210              Prefer to check at runtime if we can, to avoid limiting
 211              the VF unnecessarily when the bases are in fact independent.
 212
 213              Note that the alias checks will be removed if the VF ends up
 214              being small enough.  */
 215           return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 216         }
 217     }
 218   return true;
 219 }
 220
 221
 222 /* Function vect_analyze_data_ref_dependence.
 223
 224    Return TRUE if there (might) exist a dependence between a memory-reference
 225    DRA and a memory-reference DRB.  When versioning for alias may check a
 226    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 227    the data dependence.  */
 228
 229 static bool
 230 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 231                                   loop_vec_info loop_vinfo, int *max_vf)
 232 {
 233   unsigned int i;
 234   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 235   struct data_reference *dra = DDR_A (ddr);
 236   struct data_reference *drb = DDR_B (ddr);
 237   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
 238   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
 239   lambda_vector dist_v;
 240   unsigned int loop_depth;
 241
 242   /* In loop analysis all data references should be vectorizable.  */
 243   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 244       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 245     gcc_unreachable ();
 246
 247   /* Independent data accesses.  */
 248   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 249     return false;
 250
 251   if (dra == drb
 252       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 253     return false;
 254
 255   /* We do not have to consider dependences between accesses that belong
 256      to the same group.  */
 257   if (GROUP_FIRST_ELEMENT (stmtinfo_a)
 258       && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
 259     return false;
 260
 261   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 262      least two scalar iterations, there is always also a true dependence.
 263      As the vectorizer does not re-order loads and stores we can ignore
 264      the anti-dependence if TBAA can disambiguate both DRs similar to the
 265      case with known negative distance anti-dependences (positive
 266      distance anti-dependences would violate TBAA constraints).  */
 267   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 268        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 269       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 270                                  get_alias_set (DR_REF (drb))))
 271     return false;
 272
 273   /* Unknown data dependence.  */
 274   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 275     {
 276       /* If user asserted safelen consecutive iterations can be
 277          executed concurrently, assume independence.  */
 278       if (loop->safelen >= 2)
 279         {
 280           if (loop->safelen < *max_vf)
 281             *max_vf = loop->safelen;
 282           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 283           return false;
 284         }
 285
 286       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 287           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 288         {
 289           if (dump_enabled_p ())
 290             {
 291               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 292                                "versioning for alias not supported for: "
 293                                "can't determine dependence between ");
 294               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 295                                  DR_REF (dra));
 296               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 297               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 298                                  DR_REF (drb));
 299               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 300             }
 301           return true;
 302         }
 303
 304       if (dump_enabled_p ())
 305         {
 306           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 307                            "versioning for alias required: "
 308                            "can't determine dependence between ");
 309           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 310                              DR_REF (dra));
 311           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 312           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 313                              DR_REF (drb));
 314           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 315         }
 316
 317       /* Add to list of ddrs that need to be tested at run-time.  */
 318       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 319     }
 320
 321   /* Known data dependence.  */
 322   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 323     {
 324       /* If user asserted safelen consecutive iterations can be
 325          executed concurrently, assume independence.  */
 326       if (loop->safelen >= 2)
 327         {
 328           if (loop->safelen < *max_vf)
 329             *max_vf = loop->safelen;
 330           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 331           return false;
 332         }
 333
 334       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 335           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 336         {
 337           if (dump_enabled_p ())
 338             {
 339               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 340                                "versioning for alias not supported for: "
 341                                "bad dist vector for ");
 342               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 343                                  DR_REF (dra));
 344               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 345               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 346                                  DR_REF (drb));
 347               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 348             }
 349           return true;
 350         }
 351
 352       if (dump_enabled_p ())
 353         {
 354           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 355                            "versioning for alias required: "
 356                            "bad dist vector for ");
 357           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 358           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 359           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 360           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 361         }
 362       /* Add to list of ddrs that need to be tested at run-time.  */
 363       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 364     }
 365
 366   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 367
 368   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
 369       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
 370                                                 loop_depth, max_vf))
 371     return false;
 372
 373   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 374     {
 375       int dist = dist_v[loop_depth];
 376
 377       if (dump_enabled_p ())
 378         dump_printf_loc (MSG_NOTE, vect_location,
 379                          "dependence distance  = %d.\n", dist);
 380
 381       if (dist == 0)
 382         {
 383           if (dump_enabled_p ())
 384             {
 385               dump_printf_loc (MSG_NOTE, vect_location,
 386                                "dependence distance == 0 between ");
 387               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 388               dump_printf (MSG_NOTE, " and ");
 389               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 390               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 391             }
 392
 393           /* When we perform grouped accesses and perform implicit CSE
 394              by detecting equal accesses and doing disambiguation with
 395              runtime alias tests like for
 396                 .. = a[i];
 397                 .. = a[i+1];
 398                 a[i] = ..;
 399                 a[i+1] = ..;
 400                 *p = ..;
 401                 .. = a[i];
 402                 .. = a[i+1];
 403              where we will end up loading { a[i], a[i+1] } once, make
 404              sure that inserting group loads before the first load and
 405              stores after the last store will do the right thing.
 406              Similar for groups like
 407                 a[i] = ...;
 408                 ... = a[i];
 409                 a[i+1] = ...;
 410              where loads from the group interleave with the store.  */
 411           if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 412               || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 413             {
 414               gimple *earlier_stmt;
 415               earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
 416               if (DR_IS_WRITE
 417                     (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
 418                 {
 419                   if (dump_enabled_p ())
 420                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 421                                      "READ_WRITE dependence in interleaving."
 422                                      "\n");
 423                   return true;
 424                 }
 425             }
 426
 427           continue;
 428         }
 429
 430       if (dist > 0 && DDR_REVERSED_P (ddr))
 431         {
 432           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 433              reversed (to make distance vector positive), and the actual
 434              distance is negative.  */
 435           if (dump_enabled_p ())
 436             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 437                              "dependence distance negative.\n");
 438           /* Record a negative dependence distance to later limit the
 439              amount of stmt copying / unrolling we can perform.
 440              Only need to handle read-after-write dependence.  */
 441           if (DR_IS_READ (drb)
 442               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 443                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 444             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 445           continue;
 446         }
 447
 448       if (abs (dist) >= 2
 449           && abs (dist) < *max_vf)
 450         {
 451           /* The dependence distance requires reduction of the maximal
 452              vectorization factor.  */
 453           *max_vf = abs (dist);
 454           if (dump_enabled_p ())
 455             dump_printf_loc (MSG_NOTE, vect_location,
 456                              "adjusting maximal vectorization factor to %i\n",
 457                              *max_vf);
 458         }
 459
 460       if (abs (dist) >= *max_vf)
 461         {
 462           /* Dependence distance does not create dependence, as far as
 463              vectorization is concerned, in this case.  */
 464           if (dump_enabled_p ())
 465             dump_printf_loc (MSG_NOTE, vect_location,
 466                              "dependence distance >= VF.\n");
 467           continue;
 468         }
 469
 470       if (dump_enabled_p ())
 471         {
 472           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 473                        "not vectorized, possible dependence "
 474                        "between data-refs ");
 475           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 476           dump_printf (MSG_NOTE,  " and ");
 477           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 478           dump_printf (MSG_NOTE,  "\n");
 479         }
 480
 481       return true;
 482     }
 483
 484   return false;
 485 }
 486
 487 /* Function vect_analyze_data_ref_dependences.
 488
 489    Examine all the data references in the loop, and make sure there do not
 490    exist any data dependences between them.  Set *MAX_VF according to
 491    the maximum vectorization factor the data dependences allow.  */
 492
 493 bool
 494 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
 495 {
 496   unsigned int i;
 497   struct data_dependence_relation *ddr;
 498
 499   if (dump_enabled_p ())
 500     dump_printf_loc (MSG_NOTE, vect_location,
 501                      "=== vect_analyze_data_ref_dependences ===\n");
 502
 503   LOOP_VINFO_DDRS (loop_vinfo)
 504     .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 505              * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 506   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 507   /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
 508   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 509                                 &LOOP_VINFO_DDRS (loop_vinfo),
 510                                 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
 511     return false;
 512
 513   /* For epilogues we either have no aliases or alias versioning
 514      was applied to original loop.  Therefore we may just get max_vf
 515      using VF of original loop.  */
 516   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 517     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 518   else
 519     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 520       if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
 521         return false;
 522
 523   return true;
 524 }
 525
 526
 527 /* Function vect_slp_analyze_data_ref_dependence.
 528
 529    Return TRUE if there (might) exist a dependence between a memory-reference
 530    DRA and a memory-reference DRB.  When versioning for alias may check a
 531    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 532    the data dependence.  */
 533
 534 static bool
 535 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
 536 {
 537   struct data_reference *dra = DDR_A (ddr);
 538   struct data_reference *drb = DDR_B (ddr);
 539
 540   /* We need to check dependences of statements marked as unvectorizable
 541      as well, they still can prohibit vectorization.  */
 542
 543   /* Independent data accesses.  */
 544   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 545     return false;
 546
 547   if (dra == drb)
 548     return false;
 549
 550   /* Read-read is OK.  */
 551   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 552     return false;
 553
 554   /* If dra and drb are part of the same interleaving chain consider
 555      them independent.  */
 556   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
 557       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
 558           == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
 559     return false;
 560
 561   /* Unknown data dependence.  */
 562   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 563     {
 564       if  (dump_enabled_p ())
 565         {
 566           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 567                            "can't determine dependence between ");
 568           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 569           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 570           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 571           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 572         }
 573     }
 574   else if (dump_enabled_p ())
 575     {
 576       dump_printf_loc (MSG_NOTE, vect_location,
 577                        "determined dependence between ");
 578       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 579       dump_printf (MSG_NOTE, " and ");
 580       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 581       dump_printf (MSG_NOTE,  "\n");
 582     }
 583
 584   return true;
 585 }
 586
 587
 588 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 589    contain the vector of scalar stores of this instance if we are
 590    disambiguating the loads.  */
 591
 592 static bool
 593 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
 594                                    vec<gimple *> stores, gimple *last_store)
 595 {
 596   /* This walks over all stmts involved in the SLP load/store done
 597      in NODE verifying we can sink them up to the last stmt in the
 598      group.  */
 599   gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
 600   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 601     {
 602       gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
 603       if (access == last_access)
 604         continue;
 605       data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
 606       for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
 607            gsi_stmt (gsi) != last_access; gsi_next (&gsi))
 608         {
 609           gimple *stmt = gsi_stmt (gsi);
 610           if (! gimple_vuse (stmt)
 611               || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
 612             continue;
 613
 614           /* If we couldn't record a (single) data reference for this
 615              stmt we have to give up.  */
 616           /* ???  Here and below if dependence analysis fails we can resort
 617              to the alias oracle which can handle more kinds of stmts.  */
 618           data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
 619           if (!dr_b)
 620             return false;
 621
 622           bool dependent = false;
 623           /* If we run into a store of this same instance (we've just
 624              marked those) then delay dependence checking until we run
 625              into the last store because this is where it will have
 626              been sunk to (and we verify if we can do that as well).  */
 627           if (gimple_visited_p (stmt))
 628             {
 629               if (stmt != last_store)
 630                 continue;
 631               unsigned i;
 632               gimple *store;
 633               FOR_EACH_VEC_ELT (stores, i, store)
 634                 {
 635                   data_reference *store_dr
 636                     = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
 637                   ddr_p ddr = initialize_data_dependence_relation
 638                                 (dr_a, store_dr, vNULL);
 639                   dependent = vect_slp_analyze_data_ref_dependence (ddr);
 640                   free_dependence_relation (ddr);
 641                   if (dependent)
 642                     break;
 643                 }
 644             }
 645           else
 646             {
 647               ddr_p ddr = initialize_data_dependence_relation (dr_a,
 648                                                                dr_b, vNULL);
 649               dependent = vect_slp_analyze_data_ref_dependence (ddr);
 650               free_dependence_relation (ddr);
 651             }
 652           if (dependent)
 653             return false;
 654         }
 655     }
 656   return true;
 657 }
 658
 659
 660 /* Function vect_analyze_data_ref_dependences.
 661
 662    Examine all the data references in the basic-block, and make sure there
 663    do not exist any data dependences between them.  Set *MAX_VF according to
 664    the maximum vectorization factor the data dependences allow.  */
 665
 666 bool
 667 vect_slp_analyze_instance_dependence (slp_instance instance)
 668 {
 669   if (dump_enabled_p ())
 670     dump_printf_loc (MSG_NOTE, vect_location,
 671                      "=== vect_slp_analyze_instance_dependence ===\n");
 672
 673   /* The stores of this instance are at the root of the SLP tree.  */
 674   slp_tree store = SLP_INSTANCE_TREE (instance);
 675   if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
 676     store = NULL;
 677
 678   /* Verify we can sink stores to the vectorized stmt insert location.  */
 679   gimple *last_store = NULL;
 680   if (store)
 681     {
 682       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
 683         return false;
 684
 685       /* Mark stores in this instance and remember the last one.  */
 686       last_store = vect_find_last_scalar_stmt_in_slp (store);
 687       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 688         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
 689     }
 690
 691   bool res = true;
 692
 693   /* Verify we can sink loads to the vectorized stmt insert location,
 694      special-casing stores of this instance.  */
 695   slp_tree load;
 696   unsigned int i;
 697   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
 698     if (! vect_slp_analyze_node_dependences (instance, load,
 699                                              store
 700                                              ? SLP_TREE_SCALAR_STMTS (store)
 701                                              : vNULL, last_store))
 702       {
 703         res = false;
 704         break;
 705       }
 706
 707   /* Unset the visited flag.  */
 708   if (store)
 709     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 710       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
 711
 712   return res;
 713 }
 714
 715 /* Record in VINFO the base alignment guarantee given by DRB.  STMT is
 716    the statement that contains DRB, which is useful for recording in the
 717    dump file.  */
 718
 719 static void
 720 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
 721                             innermost_loop_behavior *drb)
 722 {
 723   bool existed;
 724   innermost_loop_behavior *&entry
 725     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
 726   if (!existed || entry->base_alignment < drb->base_alignment)
 727     {
 728       entry = drb;
 729       if (dump_enabled_p ())
 730         {
 731           dump_printf_loc (MSG_NOTE, vect_location,
 732                            "recording new base alignment for ");
 733           dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
 734           dump_printf (MSG_NOTE, "\n");
 735           dump_printf_loc (MSG_NOTE, vect_location,
 736                            "  alignment:    %d\n", drb->base_alignment);
 737           dump_printf_loc (MSG_NOTE, vect_location,
 738                            "  misalignment: %d\n", drb->base_misalignment);
 739           dump_printf_loc (MSG_NOTE, vect_location,
 740                            "  based on:     ");
 741           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 742         }
 743     }
 744 }
 745
 746 /* If the region we're going to vectorize is reached, all unconditional
 747    data references occur at least once.  We can therefore pool the base
 748    alignment guarantees from each unconditional reference.  Do this by
 749    going through all the data references in VINFO and checking whether
 750    the containing statement makes the reference unconditionally.  If so,
 751    record the alignment of the base address in VINFO so that it can be
 752    used for all other references with the same base.  */
 753
 754 void
 755 vect_record_base_alignments (vec_info *vinfo)
 756 {
 757   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
 758   struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
 759   data_reference *dr;
 760   unsigned int i;
 761   FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
 762     if (!DR_IS_CONDITIONAL_IN_STMT (dr))
 763       {
 764         gimple *stmt = DR_STMT (dr);
 765         vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
 766
 767         /* If DR is nested in the loop that is being vectorized, we can also
 768            record the alignment of the base wrt the outer loop.  */
 769         if (loop && nested_in_vect_loop_p (loop, stmt))
 770           {
 771             stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 772             vect_record_base_alignment
 773               (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
 774           }
 775       }
 776 }
 777
 778 /* Return the target alignment for the vectorized form of DR.  */
 779
 780 static unsigned int
 781 vect_calculate_target_alignment (struct data_reference *dr)
 782 {
 783   gimple *stmt = DR_STMT (dr);
 784   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 785   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 786   return targetm.vectorize.preferred_vector_alignment (vectype);
 787 }
 788
 789 /* Function vect_compute_data_ref_alignment
 790
 791    Compute the misalignment of the data reference DR.
 792
 793    Output:
 794    1. If during the misalignment computation it is found that the data reference
 795       cannot be vectorized then false is returned.
 796    2. DR_MISALIGNMENT (DR) is defined.
 797
 798    FOR NOW: No analysis is actually performed. Misalignment is calculated
 799    only for trivial cases. TODO.  */
 800
 801 bool
 802 vect_compute_data_ref_alignment (struct data_reference *dr)
 803 {
 804   gimple *stmt = DR_STMT (dr);
 805   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 806   vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
 807   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 808   struct loop *loop = NULL;
 809   tree ref = DR_REF (dr);
 810   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 811
 812   if (dump_enabled_p ())
 813     dump_printf_loc (MSG_NOTE, vect_location,
 814                      "vect_compute_data_ref_alignment:\n");
 815
 816   if (loop_vinfo)
 817     loop = LOOP_VINFO_LOOP (loop_vinfo);
 818
 819   /* Initialize misalignment to unknown.  */
 820   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
 821
 822   innermost_loop_behavior *drb = vect_dr_behavior (dr);
 823   bool step_preserves_misalignment_p;
 824
 825   unsigned HOST_WIDE_INT vector_alignment
 826     = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
 827   DR_TARGET_ALIGNMENT (dr) = vector_alignment;
 828
 829   /* No step for BB vectorization.  */
 830   if (!loop)
 831     {
 832       gcc_assert (integer_zerop (drb->step));
 833       step_preserves_misalignment_p = true;
 834     }
 835
 836   /* In case the dataref is in an inner-loop of the loop that is being
 837      vectorized (LOOP), we use the base and misalignment information
 838      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 839      stays the same throughout the execution of the inner-loop, which is why
 840      we have to check that the stride of the dataref in the inner-loop evenly
 841      divides by the vector alignment.  */
 842   else if (nested_in_vect_loop_p (loop, stmt))
 843     {
 844       step_preserves_misalignment_p
 845         = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
 846
 847       if (dump_enabled_p ())
 848         {
 849           if (step_preserves_misalignment_p)
 850             dump_printf_loc (MSG_NOTE, vect_location,
 851                              "inner step divides the vector alignment.\n");
 852           else
 853             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 854                              "inner step doesn't divide the vector"
 855                              " alignment.\n");
 856         }
 857     }
 858
 859   /* Similarly we can only use base and misalignment information relative to
 860      an innermost loop if the misalignment stays the same throughout the
 861      execution of the loop.  As above, this is the case if the stride of
 862      the dataref evenly divides by the alignment.  */
 863   else
 864     {
 865       unsigned vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 866       step_preserves_misalignment_p
 867         = ((DR_STEP_ALIGNMENT (dr) * vf) % vector_alignment) == 0;
 868
 869       if (!step_preserves_misalignment_p && dump_enabled_p ())
 870         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 871                          "step doesn't divide the vector alignment.\n");
 872     }
 873
 874   unsigned int base_alignment = drb->base_alignment;
 875   unsigned int base_misalignment = drb->base_misalignment;
 876
 877   /* Calculate the maximum of the pooled base address alignment and the
 878      alignment that we can compute for DR itself.  */
 879   innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
 880   if (entry && base_alignment < (*entry)->base_alignment)
 881     {
 882       base_alignment = (*entry)->base_alignment;
 883       base_misalignment = (*entry)->base_misalignment;
 884     }
 885
 886   if (drb->offset_alignment < vector_alignment
 887       || !step_preserves_misalignment_p
 888       /* We need to know whether the step wrt the vectorized loop is
 889          negative when computing the starting misalignment below.  */
 890       || TREE_CODE (drb->step) != INTEGER_CST)
 891     {
 892       if (dump_enabled_p ())
 893         {
 894           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 895                            "Unknown alignment for access: ");
 896           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 897           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 898         }
 899       return true;
 900     }
 901
 902   if (base_alignment < vector_alignment)
 903     {
 904       tree base = drb->base_address;
 905       if (TREE_CODE (base) == ADDR_EXPR)
 906         base = TREE_OPERAND (base, 0);
 907       if (!vect_can_force_dr_alignment_p (base,
 908                                           vector_alignment * BITS_PER_UNIT))
 909         {
 910           if (dump_enabled_p ())
 911             {
 912               dump_printf_loc (MSG_NOTE, vect_location,
 913                                "can't force alignment of ref: ");
 914               dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 915               dump_printf (MSG_NOTE, "\n");
 916             }
 917           return true;
 918         }
 919
 920       if (DECL_USER_ALIGN (base))
 921         {
 922           if (dump_enabled_p ())
 923             {
 924               dump_printf_loc (MSG_NOTE, vect_location,
 925                                "not forcing alignment of user-aligned "
 926                                "variable: ");
 927               dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
 928               dump_printf (MSG_NOTE, "\n");
 929             }
 930           return true;
 931         }
 932
 933       /* Force the alignment of the decl.
 934          NOTE: This is the only change to the code we make during
 935          the analysis phase, before deciding to vectorize the loop.  */
 936       if (dump_enabled_p ())
 937         {
 938           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
 939           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 940           dump_printf (MSG_NOTE, "\n");
 941         }
 942
 943       DR_VECT_AUX (dr)->base_decl = base;
 944       DR_VECT_AUX (dr)->base_misaligned = true;
 945       base_misalignment = 0;
 946     }
 947   unsigned int misalignment = (base_misalignment
 948                                + TREE_INT_CST_LOW (drb->init));
 949
 950   /* If this is a backward running DR then first access in the larger
 951      vectype actually is N-1 elements before the address in the DR.
 952      Adjust misalign accordingly.  */
 953   if (tree_int_cst_sgn (drb->step) < 0)
 954     /* PLUS because STEP is negative.  */
 955     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
 956                      * TREE_INT_CST_LOW (drb->step));
 957
 958   SET_DR_MISALIGNMENT (dr, misalignment & (vector_alignment - 1));
 959
 960   if (dump_enabled_p ())
 961     {
 962       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 963                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
 964       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 965       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 966     }
 967
 968   return true;
 969 }
 970
 971 /* Function vect_update_misalignment_for_peel.
 972    Sets DR's misalignment
 973    - to 0 if it has the same alignment as DR_PEEL,
 974    - to the misalignment computed using NPEEL if DR's salignment is known,
 975    - to -1 (unknown) otherwise.
 976
 977    DR - the data reference whose misalignment is to be adjusted.
 978    DR_PEEL - the data reference whose misalignment is being made
 979              zero in the vector loop by the peel.
 980    NPEEL - the number of iterations in the peel loop if the misalignment
 981            of DR_PEEL is known at compile time.  */
 982
 983 static void
 984 vect_update_misalignment_for_peel (struct data_reference *dr,
 985                                    struct data_reference *dr_peel, int npeel)
 986 {
 987   unsigned int i;
 988   vec<dr_p> same_aligned_drs;
 989   struct data_reference *current_dr;
 990   int dr_size = vect_get_scalar_dr_size (dr);
 991   int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
 992   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
 993   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
 994
 995  /* For interleaved data accesses the step in the loop must be multiplied by
 996      the size of the interleaving group.  */
 997   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 998     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
 999   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1000     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1001
1002   /* It can be assumed that the data refs with the same alignment as dr_peel
1003      are aligned in the vector loop.  */
1004   same_aligned_drs
1005     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1006   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1007     {
1008       if (current_dr != dr)
1009         continue;
1010       gcc_assert (!known_alignment_for_access_p (dr)
1011                   || !known_alignment_for_access_p (dr_peel)
1012                   || (DR_MISALIGNMENT (dr) / dr_size
1013                       == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1014       SET_DR_MISALIGNMENT (dr, 0);
1015       return;
1016     }
1017
1018   if (known_alignment_for_access_p (dr)
1019       && known_alignment_for_access_p (dr_peel))
1020     {
1021       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1022       int misal = DR_MISALIGNMENT (dr);
1023       misal += negative ? -npeel * dr_size : npeel * dr_size;
1024       misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1025       SET_DR_MISALIGNMENT (dr, misal);
1026       return;
1027     }
1028
1029   if (dump_enabled_p ())
1030     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1031                      "to unknown (-1).\n");
1032   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1033 }
1034
1035
1036 /* Function verify_data_ref_alignment
1037
1038    Return TRUE if DR can be handled with respect to alignment.  */
1039
1040 static bool
1041 verify_data_ref_alignment (data_reference_p dr)
1042 {
1043   enum dr_alignment_support supportable_dr_alignment
1044     = vect_supportable_dr_alignment (dr, false);
1045   if (!supportable_dr_alignment)
1046     {
1047       if (dump_enabled_p ())
1048         {
1049           if (DR_IS_READ (dr))
1050             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1051                              "not vectorized: unsupported unaligned load.");
1052           else
1053             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1054                              "not vectorized: unsupported unaligned "
1055                              "store.");
1056
1057           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1058                              DR_REF (dr));
1059           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1060         }
1061       return false;
1062     }
1063
1064   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1065     dump_printf_loc (MSG_NOTE, vect_location,
1066                      "Vectorizing an unaligned access.\n");
1067
1068   return true;
1069 }
1070
1071 /* Function vect_verify_datarefs_alignment
1072
1073    Return TRUE if all data references in the loop can be
1074    handled with respect to alignment.  */
1075
1076 bool
1077 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1078 {
1079   vec<data_reference_p> datarefs = vinfo->datarefs;
1080   struct data_reference *dr;
1081   unsigned int i;
1082
1083   FOR_EACH_VEC_ELT (datarefs, i, dr)
1084     {
1085       gimple *stmt = DR_STMT (dr);
1086       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1087
1088       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1089         continue;
1090
1091       /* For interleaving, only the alignment of the first access matters.   */
1092       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1093           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1094         continue;
1095
1096       /* Strided accesses perform only component accesses, alignment is
1097          irrelevant for them.  */
1098       if (STMT_VINFO_STRIDED_P (stmt_info)
1099           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1100         continue;
1101
1102       if (! verify_data_ref_alignment (dr))
1103         return false;
1104     }
1105
1106   return true;
1107 }
1108
1109 /* Given an memory reference EXP return whether its alignment is less
1110    than its size.  */
1111
1112 static bool
1113 not_size_aligned (tree exp)
1114 {
1115   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1116     return true;
1117
1118   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1119           > get_object_alignment (exp));
1120 }
1121
1122 /* Function vector_alignment_reachable_p
1123
1124    Return true if vector alignment for DR is reachable by peeling
1125    a few loop iterations.  Return false otherwise.  */
1126
1127 static bool
1128 vector_alignment_reachable_p (struct data_reference *dr)
1129 {
1130   gimple *stmt = DR_STMT (dr);
1131   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1132   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1133
1134   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1135     {
1136       /* For interleaved access we peel only if number of iterations in
1137          the prolog loop ({VF - misalignment}), is a multiple of the
1138          number of the interleaved accesses.  */
1139       int elem_size, mis_in_elements;
1140       int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1141
1142       /* FORNOW: handle only known alignment.  */
1143       if (!known_alignment_for_access_p (dr))
1144         return false;
1145
1146       elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1147       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1148
1149       if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1150         return false;
1151     }
1152
1153   /* If misalignment is known at the compile time then allow peeling
1154      only if natural alignment is reachable through peeling.  */
1155   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1156     {
1157       HOST_WIDE_INT elmsize =
1158                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1159       if (dump_enabled_p ())
1160         {
1161           dump_printf_loc (MSG_NOTE, vect_location,
1162                            "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1163           dump_printf (MSG_NOTE,
1164                        ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1165         }
1166       if (DR_MISALIGNMENT (dr) % elmsize)
1167         {
1168           if (dump_enabled_p ())
1169             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1170                              "data size does not divide the misalignment.\n");
1171           return false;
1172         }
1173     }
1174
1175   if (!known_alignment_for_access_p (dr))
1176     {
1177       tree type = TREE_TYPE (DR_REF (dr));
1178       bool is_packed = not_size_aligned (DR_REF (dr));
1179       if (dump_enabled_p ())
1180         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1181                          "Unknown misalignment, %snaturally aligned\n",
1182                          is_packed ? "not " : "");
1183       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1184     }
1185
1186   return true;
1187 }
1188
1189
1190 /* Calculate the cost of the memory access represented by DR.  */
1191
1192 static void
1193 vect_get_data_access_cost (struct data_reference *dr,
1194                            unsigned int *inside_cost,
1195                            unsigned int *outside_cost,
1196                            stmt_vector_for_cost *body_cost_vec)
1197 {
1198   gimple *stmt = DR_STMT (dr);
1199   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1200   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1201   int ncopies;
1202
1203   if (PURE_SLP_STMT (stmt_info))
1204     ncopies = 1;
1205   else
1206     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1207
1208   if (DR_IS_READ (dr))
1209     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1210                         NULL, body_cost_vec, false);
1211   else
1212     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1213
1214   if (dump_enabled_p ())
1215     dump_printf_loc (MSG_NOTE, vect_location,
1216                      "vect_get_data_access_cost: inside_cost = %d, "
1217                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1218 }
1219
1220
1221 typedef struct _vect_peel_info
1222 {
1223   struct data_reference *dr;
1224   int npeel;
1225   unsigned int count;
1226 } *vect_peel_info;
1227
1228 typedef struct _vect_peel_extended_info
1229 {
1230   struct _vect_peel_info peel_info;
1231   unsigned int inside_cost;
1232   unsigned int outside_cost;
1233 } *vect_peel_extended_info;
1234
1235
1236 /* Peeling hashtable helpers.  */
1237
1238 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1239 {
1240   static inline hashval_t hash (const _vect_peel_info *);
1241   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1242 };
1243
1244 inline hashval_t
1245 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1246 {
1247   return (hashval_t) peel_info->npeel;
1248 }
1249
1250 inline bool
1251 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1252 {
1253   return (a->npeel == b->npeel);
1254 }
1255
1256
1257 /* Insert DR into peeling hash table with NPEEL as key.  */
1258
1259 static void
1260 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1261                           loop_vec_info loop_vinfo, struct data_reference *dr,
1262                           int npeel)
1263 {
1264   struct _vect_peel_info elem, *slot;
1265   _vect_peel_info **new_slot;
1266   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1267
1268   elem.npeel = npeel;
1269   slot = peeling_htab->find (&elem);
1270   if (slot)
1271     slot->count++;
1272   else
1273     {
1274       slot = XNEW (struct _vect_peel_info);
1275       slot->npeel = npeel;
1276       slot->dr = dr;
1277       slot->count = 1;
1278       new_slot = peeling_htab->find_slot (slot, INSERT);
1279       *new_slot = slot;
1280     }
1281
1282   if (!supportable_dr_alignment
1283       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1284     slot->count += VECT_MAX_COST;
1285 }
1286
1287
1288 /* Traverse peeling hash table to find peeling option that aligns maximum
1289    number of data accesses.  */
1290
1291 int
1292 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1293                                      _vect_peel_extended_info *max)
1294 {
1295   vect_peel_info elem = *slot;
1296
1297   if (elem->count > max->peel_info.count
1298       || (elem->count == max->peel_info.count
1299           && max->peel_info.npeel > elem->npeel))
1300     {
1301       max->peel_info.npeel = elem->npeel;
1302       max->peel_info.count = elem->count;
1303       max->peel_info.dr = elem->dr;
1304     }
1305
1306   return 1;
1307 }
1308
1309 /* Get the costs of peeling NPEEL iterations checking data access costs
1310    for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1311    misalignment will be zero after peeling.  */
1312
1313 static void
1314 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1315                                 struct data_reference *dr0,
1316                                 unsigned int *inside_cost,
1317                                 unsigned int *outside_cost,
1318                                 stmt_vector_for_cost *body_cost_vec,
1319                                 unsigned int npeel,
1320                                 bool unknown_misalignment)
1321 {
1322   unsigned i;
1323   data_reference *dr;
1324
1325   FOR_EACH_VEC_ELT (datarefs, i, dr)
1326     {
1327       gimple *stmt = DR_STMT (dr);
1328       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1329       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1330         continue;
1331
1332       /* For interleaving, only the alignment of the first access
1333          matters.  */
1334       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1335           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1336         continue;
1337
1338       /* Strided accesses perform only component accesses, alignment is
1339          irrelevant for them.  */
1340       if (STMT_VINFO_STRIDED_P (stmt_info)
1341           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1342         continue;
1343
1344       int save_misalignment;
1345       save_misalignment = DR_MISALIGNMENT (dr);
1346       if (npeel == 0)
1347         ;
1348       else if (unknown_misalignment && dr == dr0)
1349         SET_DR_MISALIGNMENT (dr, 0);
1350       else
1351         vect_update_misalignment_for_peel (dr, dr0, npeel);
1352       vect_get_data_access_cost (dr, inside_cost, outside_cost,
1353                                  body_cost_vec);
1354       SET_DR_MISALIGNMENT (dr, save_misalignment);
1355     }
1356 }
1357
1358 /* Traverse peeling hash table and calculate cost for each peeling option.
1359    Find the one with the lowest cost.  */
1360
1361 int
1362 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1363                                    _vect_peel_extended_info *min)
1364 {
1365   vect_peel_info elem = *slot;
1366   int dummy;
1367   unsigned int inside_cost = 0, outside_cost = 0;
1368   gimple *stmt = DR_STMT (elem->dr);
1369   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1370   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1371   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1372                        epilogue_cost_vec;
1373
1374   prologue_cost_vec.create (2);
1375   body_cost_vec.create (2);
1376   epilogue_cost_vec.create (2);
1377
1378   vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1379                                   elem->dr, &inside_cost, &outside_cost,
1380                                   &body_cost_vec, elem->npeel, false);
1381
1382   body_cost_vec.release ();
1383
1384   outside_cost += vect_get_known_peeling_cost
1385     (loop_vinfo, elem->npeel, &dummy,
1386      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1387      &prologue_cost_vec, &epilogue_cost_vec);
1388
1389   /* Prologue and epilogue costs are added to the target model later.
1390      These costs depend only on the scalar iteration cost, the
1391      number of peeling iterations finally chosen, and the number of
1392      misaligned statements.  So discard the information found here.  */
1393   prologue_cost_vec.release ();
1394   epilogue_cost_vec.release ();
1395
1396   if (inside_cost < min->inside_cost
1397       || (inside_cost == min->inside_cost
1398           && outside_cost < min->outside_cost))
1399     {
1400       min->inside_cost = inside_cost;
1401       min->outside_cost = outside_cost;
1402       min->peel_info.dr = elem->dr;
1403       min->peel_info.npeel = elem->npeel;
1404       min->peel_info.count = elem->count;
1405     }
1406
1407   return 1;
1408 }
1409
1410
1411 /* Choose best peeling option by traversing peeling hash table and either
1412    choosing an option with the lowest cost (if cost model is enabled) or the
1413    option that aligns as many accesses as possible.  */
1414
1415 static struct _vect_peel_extended_info
1416 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1417                                        loop_vec_info loop_vinfo)
1418 {
1419    struct _vect_peel_extended_info res;
1420
1421    res.peel_info.dr = NULL;
1422
1423    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1424      {
1425        res.inside_cost = INT_MAX;
1426        res.outside_cost = INT_MAX;
1427        peeling_htab->traverse <_vect_peel_extended_info *,
1428                                vect_peeling_hash_get_lowest_cost> (&res);
1429      }
1430    else
1431      {
1432        res.peel_info.count = 0;
1433        peeling_htab->traverse <_vect_peel_extended_info *,
1434                                vect_peeling_hash_get_most_frequent> (&res);
1435        res.inside_cost = 0;
1436        res.outside_cost = 0;
1437      }
1438
1439    return res;
1440 }
1441
1442 /* Return true if the new peeling NPEEL is supported.  */
1443
1444 static bool
1445 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1446                           unsigned npeel)
1447 {
1448   unsigned i;
1449   struct data_reference *dr = NULL;
1450   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1451   gimple *stmt;
1452   stmt_vec_info stmt_info;
1453   enum dr_alignment_support supportable_dr_alignment;
1454
1455   /* Ensure that all data refs can be vectorized after the peel.  */
1456   FOR_EACH_VEC_ELT (datarefs, i, dr)
1457     {
1458       int save_misalignment;
1459
1460       if (dr == dr0)
1461         continue;
1462
1463       stmt = DR_STMT (dr);
1464       stmt_info = vinfo_for_stmt (stmt);
1465       /* For interleaving, only the alignment of the first access
1466          matters.  */
1467       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1468           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1469         continue;
1470
1471       /* Strided accesses perform only component accesses, alignment is
1472          irrelevant for them.  */
1473       if (STMT_VINFO_STRIDED_P (stmt_info)
1474           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1475         continue;
1476
1477       save_misalignment = DR_MISALIGNMENT (dr);
1478       vect_update_misalignment_for_peel (dr, dr0, npeel);
1479       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1480       SET_DR_MISALIGNMENT (dr, save_misalignment);
1481
1482       if (!supportable_dr_alignment)
1483         return false;
1484     }
1485
1486   return true;
1487 }
1488
1489 /* Function vect_enhance_data_refs_alignment
1490
1491    This pass will use loop versioning and loop peeling in order to enhance
1492    the alignment of data references in the loop.
1493
1494    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1495    original loop is to be vectorized.  Any other loops that are created by
1496    the transformations performed in this pass - are not supposed to be
1497    vectorized.  This restriction will be relaxed.
1498
1499    This pass will require a cost model to guide it whether to apply peeling
1500    or versioning or a combination of the two.  For example, the scheme that
1501    intel uses when given a loop with several memory accesses, is as follows:
1502    choose one memory access ('p') which alignment you want to force by doing
1503    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1504    other accesses are not necessarily aligned, or (2) use loop versioning to
1505    generate one loop in which all accesses are aligned, and another loop in
1506    which only 'p' is necessarily aligned.
1507
1508    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1509    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1510    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1511
1512    Devising a cost model is the most critical aspect of this work.  It will
1513    guide us on which access to peel for, whether to use loop versioning, how
1514    many versions to create, etc.  The cost model will probably consist of
1515    generic considerations as well as target specific considerations (on
1516    powerpc for example, misaligned stores are more painful than misaligned
1517    loads).
1518
1519    Here are the general steps involved in alignment enhancements:
1520
1521      -- original loop, before alignment analysis:
1522         for (i=0; i<N; i++){
1523           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1524           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1525         }
1526
1527      -- After vect_compute_data_refs_alignment:
1528         for (i=0; i<N; i++){
1529           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1530           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1531         }
1532
1533      -- Possibility 1: we do loop versioning:
1534      if (p is aligned) {
1535         for (i=0; i<N; i++){    # loop 1A
1536           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1537           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1538         }
1539      }
1540      else {
1541         for (i=0; i<N; i++){    # loop 1B
1542           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1543           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1544         }
1545      }
1546
1547      -- Possibility 2: we do loop peeling:
1548      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1549         x = q[i];
1550         p[i] = y;
1551      }
1552      for (i = 3; i < N; i++){   # loop 2A
1553         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1554         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1555      }
1556
1557      -- Possibility 3: combination of loop peeling and versioning:
1558      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1559         x = q[i];
1560         p[i] = y;
1561      }
1562      if (p is aligned) {
1563         for (i = 3; i<N; i++){  # loop 3A
1564           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1565           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1566         }
1567      }
1568      else {
1569         for (i = 3; i<N; i++){  # loop 3B
1570           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1571           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1572         }
1573      }
1574
1575      These loops are later passed to loop_transform to be vectorized.  The
1576      vectorizer will use the alignment information to guide the transformation
1577      (whether to generate regular loads/stores, or with special handling for
1578      misalignment).  */
1579
1580 bool
1581 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1582 {
1583   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1584   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1585   enum dr_alignment_support supportable_dr_alignment;
1586   struct data_reference *dr0 = NULL, *first_store = NULL;
1587   struct data_reference *dr;
1588   unsigned int i, j;
1589   bool do_peeling = false;
1590   bool do_versioning = false;
1591   bool stat;
1592   gimple *stmt;
1593   stmt_vec_info stmt_info;
1594   unsigned int npeel = 0;
1595   bool one_misalignment_known = false;
1596   bool one_misalignment_unknown = false;
1597   bool one_dr_unsupportable = false;
1598   struct data_reference *unsupportable_dr = NULL;
1599   unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1600   unsigned possible_npeel_number = 1;
1601   tree vectype;
1602   unsigned int nelements, mis, same_align_drs_max = 0;
1603   hash_table<peel_info_hasher> peeling_htab (1);
1604
1605   if (dump_enabled_p ())
1606     dump_printf_loc (MSG_NOTE, vect_location,
1607                      "=== vect_enhance_data_refs_alignment ===\n");
1608
1609   /* Reset data so we can safely be called multiple times.  */
1610   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1611   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1612
1613   /* While cost model enhancements are expected in the future, the high level
1614      view of the code at this time is as follows:
1615
1616      A) If there is a misaligned access then see if peeling to align
1617         this access can make all data references satisfy
1618         vect_supportable_dr_alignment.  If so, update data structures
1619         as needed and return true.
1620
1621      B) If peeling wasn't possible and there is a data reference with an
1622         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1623         then see if loop versioning checks can be used to make all data
1624         references satisfy vect_supportable_dr_alignment.  If so, update
1625         data structures as needed and return true.
1626
1627      C) If neither peeling nor versioning were successful then return false if
1628         any data reference does not satisfy vect_supportable_dr_alignment.
1629
1630      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1631
1632      Note, Possibility 3 above (which is peeling and versioning together) is not
1633      being done at this time.  */
1634
1635   /* (1) Peeling to force alignment.  */
1636
1637   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1638      Considerations:
1639      + How many accesses will become aligned due to the peeling
1640      - How many accesses will become unaligned due to the peeling,
1641        and the cost of misaligned accesses.
1642      - The cost of peeling (the extra runtime checks, the increase
1643        in code size).  */
1644
1645   FOR_EACH_VEC_ELT (datarefs, i, dr)
1646     {
1647       stmt = DR_STMT (dr);
1648       stmt_info = vinfo_for_stmt (stmt);
1649
1650       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1651         continue;
1652
1653       /* For interleaving, only the alignment of the first access
1654          matters.  */
1655       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1656           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1657         continue;
1658
1659       /* For invariant accesses there is nothing to enhance.  */
1660       if (integer_zerop (DR_STEP (dr)))
1661         continue;
1662
1663       /* Strided accesses perform only component accesses, alignment is
1664          irrelevant for them.  */
1665       if (STMT_VINFO_STRIDED_P (stmt_info)
1666           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1667         continue;
1668
1669       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1670       do_peeling = vector_alignment_reachable_p (dr);
1671       if (do_peeling)
1672         {
1673           if (known_alignment_for_access_p (dr))
1674             {
1675               unsigned int npeel_tmp = 0;
1676               bool negative = tree_int_cst_compare (DR_STEP (dr),
1677                                                     size_zero_node) < 0;
1678
1679               vectype = STMT_VINFO_VECTYPE (stmt_info);
1680               nelements = TYPE_VECTOR_SUBPARTS (vectype);
1681               unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1682               unsigned int dr_size = vect_get_scalar_dr_size (dr);
1683               mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1684               if (DR_MISALIGNMENT (dr) != 0)
1685                 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1686
1687               /* For multiple types, it is possible that the bigger type access
1688                  will have more than one peeling option.  E.g., a loop with two
1689                  types: one of size (vector size / 4), and the other one of
1690                  size (vector size / 8).  Vectorization factor will 8.  If both
1691                  accesses are misaligned by 3, the first one needs one scalar
1692                  iteration to be aligned, and the second one needs 5.  But the
1693                  first one will be aligned also by peeling 5 scalar
1694                  iterations, and in that case both accesses will be aligned.
1695                  Hence, except for the immediate peeling amount, we also want
1696                  to try to add full vector size, while we don't exceed
1697                  vectorization factor.
1698                  We do this automatically for cost model, since we calculate
1699                  cost for every peeling option.  */
1700               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1701                 {
1702                   if (STMT_SLP_TYPE (stmt_info))
1703                     possible_npeel_number
1704                       = (vf * GROUP_SIZE (stmt_info)) / nelements;
1705                   else
1706                     possible_npeel_number = vf / nelements;
1707
1708                   /* NPEEL_TMP is 0 when there is no misalignment, but also
1709                      allow peeling NELEMENTS.  */
1710                   if (DR_MISALIGNMENT (dr) == 0)
1711                     possible_npeel_number++;
1712                 }
1713
1714               /* Save info about DR in the hash table.  Also include peeling
1715                  amounts according to the explanation above.  */
1716               for (j = 0; j < possible_npeel_number; j++)
1717                 {
1718                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1719                                             dr, npeel_tmp);
1720                   npeel_tmp += target_align / dr_size;
1721                 }
1722
1723               one_misalignment_known = true;
1724             }
1725           else
1726             {
1727               /* If we don't know any misalignment values, we prefer
1728                  peeling for data-ref that has the maximum number of data-refs
1729                  with the same alignment, unless the target prefers to align
1730                  stores over load.  */
1731               unsigned same_align_drs
1732                 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1733               if (!dr0
1734                   || same_align_drs_max < same_align_drs)
1735                 {
1736                   same_align_drs_max = same_align_drs;
1737                   dr0 = dr;
1738                 }
1739               /* For data-refs with the same number of related
1740                  accesses prefer the one where the misalign
1741                  computation will be invariant in the outermost loop.  */
1742               else if (same_align_drs_max == same_align_drs)
1743                 {
1744                   struct loop *ivloop0, *ivloop;
1745                   ivloop0 = outermost_invariant_loop_for_expr
1746                     (loop, DR_BASE_ADDRESS (dr0));
1747                   ivloop = outermost_invariant_loop_for_expr
1748                     (loop, DR_BASE_ADDRESS (dr));
1749                   if ((ivloop && !ivloop0)
1750                       || (ivloop && ivloop0
1751                           && flow_loop_nested_p (ivloop, ivloop0)))
1752                     dr0 = dr;
1753                 }
1754
1755               one_misalignment_unknown = true;
1756
1757               /* Check for data refs with unsupportable alignment that
1758                  can be peeled.  */
1759               if (!supportable_dr_alignment)
1760               {
1761                 one_dr_unsupportable = true;
1762                 unsupportable_dr = dr;
1763               }
1764
1765               if (!first_store && DR_IS_WRITE (dr))
1766                 first_store = dr;
1767             }
1768         }
1769       else
1770         {
1771           if (!aligned_access_p (dr))
1772             {
1773               if (dump_enabled_p ())
1774                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1775                                  "vector alignment may not be reachable\n");
1776               break;
1777             }
1778         }
1779     }
1780
1781   /* Check if we can possibly peel the loop.  */
1782   if (!vect_can_advance_ivs_p (loop_vinfo)
1783       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1784       || loop->inner)
1785     do_peeling = false;
1786
1787   struct _vect_peel_extended_info peel_for_known_alignment;
1788   struct _vect_peel_extended_info peel_for_unknown_alignment;
1789   struct _vect_peel_extended_info best_peel;
1790
1791   peel_for_unknown_alignment.inside_cost = INT_MAX;
1792   peel_for_unknown_alignment.outside_cost = INT_MAX;
1793   peel_for_unknown_alignment.peel_info.count = 0;
1794
1795   if (do_peeling
1796       && one_misalignment_unknown)
1797     {
1798       /* Check if the target requires to prefer stores over loads, i.e., if
1799          misaligned stores are more expensive than misaligned loads (taking
1800          drs with same alignment into account).  */
1801       unsigned int load_inside_cost = 0;
1802       unsigned int load_outside_cost = 0;
1803       unsigned int store_inside_cost = 0;
1804       unsigned int store_outside_cost = 0;
1805
1806       stmt_vector_for_cost dummy;
1807       dummy.create (2);
1808       vect_get_peeling_costs_all_drs (datarefs, dr0,
1809                                       &load_inside_cost,
1810                                       &load_outside_cost,
1811                                       &dummy, vf / 2, true);
1812       dummy.release ();
1813
1814       if (first_store)
1815         {
1816           dummy.create (2);
1817           vect_get_peeling_costs_all_drs (datarefs, first_store,
1818                                           &store_inside_cost,
1819                                           &store_outside_cost,
1820                                           &dummy, vf / 2, true);
1821           dummy.release ();
1822         }
1823       else
1824         {
1825           store_inside_cost = INT_MAX;
1826           store_outside_cost = INT_MAX;
1827         }
1828
1829       if (load_inside_cost > store_inside_cost
1830           || (load_inside_cost == store_inside_cost
1831               && load_outside_cost > store_outside_cost))
1832         {
1833           dr0 = first_store;
1834           peel_for_unknown_alignment.inside_cost = store_inside_cost;
1835           peel_for_unknown_alignment.outside_cost = store_outside_cost;
1836         }
1837       else
1838         {
1839           peel_for_unknown_alignment.inside_cost = load_inside_cost;
1840           peel_for_unknown_alignment.outside_cost = load_outside_cost;
1841         }
1842
1843       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1844       prologue_cost_vec.create (2);
1845       epilogue_cost_vec.create (2);
1846
1847       int dummy2;
1848       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1849         (loop_vinfo, vf / 2, &dummy2,
1850          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1851          &prologue_cost_vec, &epilogue_cost_vec);
1852
1853       prologue_cost_vec.release ();
1854       epilogue_cost_vec.release ();
1855
1856       peel_for_unknown_alignment.peel_info.count = 1
1857         + STMT_VINFO_SAME_ALIGN_REFS
1858         (vinfo_for_stmt (DR_STMT (dr0))).length ();
1859     }
1860
1861   peel_for_unknown_alignment.peel_info.npeel = 0;
1862   peel_for_unknown_alignment.peel_info.dr = dr0;
1863
1864   best_peel = peel_for_unknown_alignment;
1865
1866   peel_for_known_alignment.inside_cost = INT_MAX;
1867   peel_for_known_alignment.outside_cost = INT_MAX;
1868   peel_for_known_alignment.peel_info.count = 0;
1869   peel_for_known_alignment.peel_info.dr = NULL;
1870
1871   if (do_peeling && one_misalignment_known)
1872     {
1873       /* Peeling is possible, but there is no data access that is not supported
1874          unless aligned.  So we try to choose the best possible peeling from
1875          the hash table.  */
1876       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1877         (&peeling_htab, loop_vinfo);
1878     }
1879
1880   /* Compare costs of peeling for known and unknown alignment. */
1881   if (peel_for_known_alignment.peel_info.dr != NULL
1882       && peel_for_unknown_alignment.inside_cost
1883       >= peel_for_known_alignment.inside_cost)
1884     {
1885       best_peel = peel_for_known_alignment;
1886
1887       /* If the best peeling for known alignment has NPEEL == 0, perform no
1888          peeling at all except if there is an unsupportable dr that we can
1889          align.  */
1890       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1891         do_peeling = false;
1892     }
1893
1894   /* If there is an unsupportable data ref, prefer this over all choices so far
1895      since we'd have to discard a chosen peeling except when it accidentally
1896      aligned the unsupportable data ref.  */
1897   if (one_dr_unsupportable)
1898     dr0 = unsupportable_dr;
1899   else if (do_peeling)
1900     {
1901       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1902          TODO: Use nopeel_outside_cost or get rid of it?  */
1903       unsigned nopeel_inside_cost = 0;
1904       unsigned nopeel_outside_cost = 0;
1905
1906       stmt_vector_for_cost dummy;
1907       dummy.create (2);
1908       vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1909                                       &nopeel_outside_cost, &dummy, 0, false);
1910       dummy.release ();
1911
1912       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
1913          costs will be recorded.  */
1914       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1915       prologue_cost_vec.create (2);
1916       epilogue_cost_vec.create (2);
1917
1918       int dummy2;
1919       nopeel_outside_cost += vect_get_known_peeling_cost
1920         (loop_vinfo, 0, &dummy2,
1921          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1922          &prologue_cost_vec, &epilogue_cost_vec);
1923
1924       prologue_cost_vec.release ();
1925       epilogue_cost_vec.release ();
1926
1927       npeel = best_peel.peel_info.npeel;
1928       dr0 = best_peel.peel_info.dr;
1929
1930       /* If no peeling is not more expensive than the best peeling we
1931          have so far, don't perform any peeling.  */
1932       if (nopeel_inside_cost <= best_peel.inside_cost)
1933         do_peeling = false;
1934     }
1935
1936   if (do_peeling)
1937     {
1938       stmt = DR_STMT (dr0);
1939       stmt_info = vinfo_for_stmt (stmt);
1940       vectype = STMT_VINFO_VECTYPE (stmt_info);
1941
1942       if (known_alignment_for_access_p (dr0))
1943         {
1944           bool negative = tree_int_cst_compare (DR_STEP (dr0),
1945                                                 size_zero_node) < 0;
1946           if (!npeel)
1947             {
1948               /* Since it's known at compile time, compute the number of
1949                  iterations in the peeled loop (the peeling factor) for use in
1950                  updating DR_MISALIGNMENT values.  The peeling factor is the
1951                  vectorization factor minus the misalignment as an element
1952                  count.  */
1953               mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
1954               unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
1955               npeel = ((mis & (target_align - 1))
1956                        / vect_get_scalar_dr_size (dr0));
1957             }
1958
1959           /* For interleaved data access every iteration accesses all the
1960              members of the group, therefore we divide the number of iterations
1961              by the group size.  */
1962           stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1963           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1964             npeel /= GROUP_SIZE (stmt_info);
1965
1966           if (dump_enabled_p ())
1967             dump_printf_loc (MSG_NOTE, vect_location,
1968                              "Try peeling by %d\n", npeel);
1969         }
1970
1971       /* Ensure that all datarefs can be vectorized after the peel.  */
1972       if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1973         do_peeling = false;
1974
1975       /* Check if all datarefs are supportable and log.  */
1976       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1977         {
1978           stat = vect_verify_datarefs_alignment (loop_vinfo);
1979           if (!stat)
1980             do_peeling = false;
1981           else
1982             return stat;
1983         }
1984
1985       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
1986       if (do_peeling)
1987         {
1988           unsigned max_allowed_peel
1989             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1990           if (max_allowed_peel != (unsigned)-1)
1991             {
1992               unsigned max_peel = npeel;
1993               if (max_peel == 0)
1994                 {
1995                   unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
1996                   max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
1997                 }
1998               if (max_peel > max_allowed_peel)
1999                 {
2000                   do_peeling = false;
2001                   if (dump_enabled_p ())
2002                     dump_printf_loc (MSG_NOTE, vect_location,
2003                         "Disable peeling, max peels reached: %d\n", max_peel);
2004                 }
2005             }
2006         }
2007
2008       /* Cost model #2 - if peeling may result in a remaining loop not
2009          iterating enough to be vectorized then do not peel.  */
2010       if (do_peeling
2011           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2012         {
2013           unsigned max_peel
2014             = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
2015           if (LOOP_VINFO_INT_NITERS (loop_vinfo)
2016               < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
2017             do_peeling = false;
2018         }
2019
2020       if (do_peeling)
2021         {
2022           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2023              If the misalignment of DR_i is identical to that of dr0 then set
2024              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2025              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2026              by the peeling factor times the element size of DR_i (MOD the
2027              vectorization factor times the size).  Otherwise, the
2028              misalignment of DR_i must be set to unknown.  */
2029           FOR_EACH_VEC_ELT (datarefs, i, dr)
2030             if (dr != dr0)
2031               {
2032                 /* Strided accesses perform only component accesses, alignment
2033                    is irrelevant for them.  */
2034                 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2035                 if (STMT_VINFO_STRIDED_P (stmt_info)
2036                     && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2037                   continue;
2038
2039                 vect_update_misalignment_for_peel (dr, dr0, npeel);
2040               }
2041
2042           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2043           if (npeel)
2044             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2045           else
2046             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2047               = DR_MISALIGNMENT (dr0);
2048           SET_DR_MISALIGNMENT (dr0, 0);
2049           if (dump_enabled_p ())
2050             {
2051               dump_printf_loc (MSG_NOTE, vect_location,
2052                                "Alignment of access forced using peeling.\n");
2053               dump_printf_loc (MSG_NOTE, vect_location,
2054                                "Peeling for alignment will be applied.\n");
2055             }
2056
2057           /* The inside-loop cost will be accounted for in vectorizable_load
2058              and vectorizable_store correctly with adjusted alignments.
2059              Drop the body_cst_vec on the floor here.  */
2060           stat = vect_verify_datarefs_alignment (loop_vinfo);
2061           gcc_assert (stat);
2062           return stat;
2063         }
2064     }
2065
2066   /* (2) Versioning to force alignment.  */
2067
2068   /* Try versioning if:
2069      1) optimize loop for speed
2070      2) there is at least one unsupported misaligned data ref with an unknown
2071         misalignment, and
2072      3) all misaligned data refs with a known misalignment are supported, and
2073      4) the number of runtime alignment checks is within reason.  */
2074
2075   do_versioning =
2076         optimize_loop_nest_for_speed_p (loop)
2077         && (!loop->inner); /* FORNOW */
2078
2079   if (do_versioning)
2080     {
2081       FOR_EACH_VEC_ELT (datarefs, i, dr)
2082         {
2083           stmt = DR_STMT (dr);
2084           stmt_info = vinfo_for_stmt (stmt);
2085
2086           /* For interleaving, only the alignment of the first access
2087              matters.  */
2088           if (aligned_access_p (dr)
2089               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2090                   && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2091             continue;
2092
2093           if (STMT_VINFO_STRIDED_P (stmt_info))
2094             {
2095               /* Strided loads perform only component accesses, alignment is
2096                  irrelevant for them.  */
2097               if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2098                 continue;
2099               do_versioning = false;
2100               break;
2101             }
2102
2103           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2104
2105           if (!supportable_dr_alignment)
2106             {
2107               gimple *stmt;
2108               int mask;
2109               tree vectype;
2110
2111               if (known_alignment_for_access_p (dr)
2112                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2113                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2114                 {
2115                   do_versioning = false;
2116                   break;
2117                 }
2118
2119               stmt = DR_STMT (dr);
2120               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2121               gcc_assert (vectype);
2122
2123               /* The rightmost bits of an aligned address must be zeros.
2124                  Construct the mask needed for this test.  For example,
2125                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2126                  mask must be 15 = 0xf. */
2127               mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
2128
2129               /* FORNOW: use the same mask to test all potentially unaligned
2130                  references in the loop.  The vectorizer currently supports
2131                  a single vector size, see the reference to
2132                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2133                  vectorization factor is computed.  */
2134               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2135                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2136               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2137               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2138                       DR_STMT (dr));
2139             }
2140         }
2141
2142       /* Versioning requires at least one misaligned data reference.  */
2143       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2144         do_versioning = false;
2145       else if (!do_versioning)
2146         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2147     }
2148
2149   if (do_versioning)
2150     {
2151       vec<gimple *> may_misalign_stmts
2152         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2153       gimple *stmt;
2154
2155       /* It can now be assumed that the data references in the statements
2156          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2157          of the loop being vectorized.  */
2158       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2159         {
2160           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2161           dr = STMT_VINFO_DATA_REF (stmt_info);
2162           SET_DR_MISALIGNMENT (dr, 0);
2163           if (dump_enabled_p ())
2164             dump_printf_loc (MSG_NOTE, vect_location,
2165                              "Alignment of access forced using versioning.\n");
2166         }
2167
2168       if (dump_enabled_p ())
2169         dump_printf_loc (MSG_NOTE, vect_location,
2170                          "Versioning for alignment will be applied.\n");
2171
2172       /* Peeling and versioning can't be done together at this time.  */
2173       gcc_assert (! (do_peeling && do_versioning));
2174
2175       stat = vect_verify_datarefs_alignment (loop_vinfo);
2176       gcc_assert (stat);
2177       return stat;
2178     }
2179
2180   /* This point is reached if neither peeling nor versioning is being done.  */
2181   gcc_assert (! (do_peeling || do_versioning));
2182
2183   stat = vect_verify_datarefs_alignment (loop_vinfo);
2184   return stat;
2185 }
2186
2187
2188 /* Function vect_find_same_alignment_drs.
2189
2190    Update group and alignment relations according to the chosen
2191    vectorization factor.  */
2192
2193 static void
2194 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2195 {
2196   struct data_reference *dra = DDR_A (ddr);
2197   struct data_reference *drb = DDR_B (ddr);
2198   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2199   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2200
2201   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2202     return;
2203
2204   if (dra == drb)
2205     return;
2206
2207   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2208       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2209       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2210     return;
2211
2212   /* Two references with distance zero have the same alignment.  */
2213   offset_int diff = (wi::to_offset (DR_INIT (dra))
2214                      - wi::to_offset (DR_INIT (drb)));
2215   if (diff != 0)
2216     {
2217       /* Get the wider of the two alignments.  */
2218       unsigned int align_a = (vect_calculate_target_alignment (dra)
2219                               / BITS_PER_UNIT);
2220       unsigned int align_b = (vect_calculate_target_alignment (drb)
2221                               / BITS_PER_UNIT);
2222       unsigned int max_align = MAX (align_a, align_b);
2223
2224       /* Require the gap to be a multiple of the larger vector alignment.  */
2225       if (!wi::multiple_of_p (diff, max_align, SIGNED))
2226         return;
2227     }
2228
2229   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2230   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2231   if (dump_enabled_p ())
2232     {
2233       dump_printf_loc (MSG_NOTE, vect_location,
2234                        "accesses have the same alignment: ");
2235       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2236       dump_printf (MSG_NOTE,  " and ");
2237       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2238       dump_printf (MSG_NOTE, "\n");
2239     }
2240 }
2241
2242
2243 /* Function vect_analyze_data_refs_alignment
2244
2245    Analyze the alignment of the data-references in the loop.
2246    Return FALSE if a data reference is found that cannot be vectorized.  */
2247
2248 bool
2249 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2250 {
2251   if (dump_enabled_p ())
2252     dump_printf_loc (MSG_NOTE, vect_location,
2253                      "=== vect_analyze_data_refs_alignment ===\n");
2254
2255   /* Mark groups of data references with same alignment using
2256      data dependence information.  */
2257   vec<ddr_p> ddrs = vinfo->ddrs;
2258   struct data_dependence_relation *ddr;
2259   unsigned int i;
2260
2261   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2262     vect_find_same_alignment_drs (ddr);
2263
2264   vec<data_reference_p> datarefs = vinfo->datarefs;
2265   struct data_reference *dr;
2266
2267   vect_record_base_alignments (vinfo);
2268   FOR_EACH_VEC_ELT (datarefs, i, dr)
2269     {
2270       stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2271       if (STMT_VINFO_VECTORIZABLE (stmt_info)
2272           && !vect_compute_data_ref_alignment (dr))
2273         {
2274           /* Strided accesses perform only component accesses, misalignment
2275              information is irrelevant for them.  */
2276           if (STMT_VINFO_STRIDED_P (stmt_info)
2277               && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2278             continue;
2279
2280           if (dump_enabled_p ())
2281             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2282                              "not vectorized: can't calculate alignment "
2283                              "for data ref.\n");
2284
2285           return false;
2286         }
2287     }
2288
2289   return true;
2290 }
2291
2292
2293 /* Analyze alignment of DRs of stmts in NODE.  */
2294
2295 static bool
2296 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2297 {
2298   /* We vectorize from the first scalar stmt in the node unless
2299      the node is permuted in which case we start from the first
2300      element in the group.  */
2301   gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2302   data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2303   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2304     first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2305
2306   data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2307   if (! vect_compute_data_ref_alignment (dr)
2308       /* For creating the data-ref pointer we need alignment of the
2309          first element anyway.  */
2310       || (dr != first_dr
2311           && ! vect_compute_data_ref_alignment (first_dr))
2312       || ! verify_data_ref_alignment (dr))
2313     {
2314       if (dump_enabled_p ())
2315         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2316                          "not vectorized: bad data alignment in basic "
2317                          "block.\n");
2318       return false;
2319     }
2320
2321   return true;
2322 }
2323
2324 /* Function vect_slp_analyze_instance_alignment
2325
2326    Analyze the alignment of the data-references in the SLP instance.
2327    Return FALSE if a data reference is found that cannot be vectorized.  */
2328
2329 bool
2330 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2331 {
2332   if (dump_enabled_p ())
2333     dump_printf_loc (MSG_NOTE, vect_location,
2334                      "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2335
2336   slp_tree node;
2337   unsigned i;
2338   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2339     if (! vect_slp_analyze_and_verify_node_alignment (node))
2340       return false;
2341
2342   node = SLP_INSTANCE_TREE (instance);
2343   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2344       && ! vect_slp_analyze_and_verify_node_alignment
2345              (SLP_INSTANCE_TREE (instance)))
2346     return false;
2347
2348   return true;
2349 }
2350
2351
2352 /* Analyze groups of accesses: check that DR belongs to a group of
2353    accesses of legal size, step, etc.  Detect gaps, single element
2354    interleaving, and other special cases. Set grouped access info.
2355    Collect groups of strided stores for further use in SLP analysis.
2356    Worker for vect_analyze_group_access.  */
2357
2358 static bool
2359 vect_analyze_group_access_1 (struct data_reference *dr)
2360 {
2361   tree step = DR_STEP (dr);
2362   tree scalar_type = TREE_TYPE (DR_REF (dr));
2363   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2364   gimple *stmt = DR_STMT (dr);
2365   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2366   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2367   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2368   HOST_WIDE_INT dr_step = -1;
2369   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2370   bool slp_impossible = false;
2371
2372   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2373      size of the interleaving group (including gaps).  */
2374   if (tree_fits_shwi_p (step))
2375     {
2376       dr_step = tree_to_shwi (step);
2377       /* Check that STEP is a multiple of type size.  Otherwise there is
2378          a non-element-sized gap at the end of the group which we
2379          cannot represent in GROUP_GAP or GROUP_SIZE.
2380          ???  As we can handle non-constant step fine here we should
2381          simply remove uses of GROUP_GAP between the last and first
2382          element and instead rely on DR_STEP.  GROUP_SIZE then would
2383          simply not include that gap.  */
2384       if ((dr_step % type_size) != 0)
2385         {
2386           if (dump_enabled_p ())
2387             {
2388               dump_printf_loc (MSG_NOTE, vect_location,
2389                                "Step ");
2390               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2391               dump_printf (MSG_NOTE,
2392                            " is not a multiple of the element size for ");
2393               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2394               dump_printf (MSG_NOTE, "\n");
2395             }
2396           return false;
2397         }
2398       groupsize = absu_hwi (dr_step) / type_size;
2399     }
2400   else
2401     groupsize = 0;
2402
2403   /* Not consecutive access is possible only if it is a part of interleaving.  */
2404   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2405     {
2406       /* Check if it this DR is a part of interleaving, and is a single
2407          element of the group that is accessed in the loop.  */
2408
2409       /* Gaps are supported only for loads. STEP must be a multiple of the type
2410          size.  The size of the group must be a power of 2.  */
2411       if (DR_IS_READ (dr)
2412           && (dr_step % type_size) == 0
2413           && groupsize > 0
2414           && pow2p_hwi (groupsize))
2415         {
2416           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2417           GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2418           GROUP_GAP (stmt_info) = groupsize - 1;
2419           if (dump_enabled_p ())
2420             {
2421               dump_printf_loc (MSG_NOTE, vect_location,
2422                                "Detected single element interleaving ");
2423               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2424               dump_printf (MSG_NOTE, " step ");
2425               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2426               dump_printf (MSG_NOTE, "\n");
2427             }
2428
2429           return true;
2430         }
2431
2432       if (dump_enabled_p ())
2433         {
2434           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2435                            "not consecutive access ");
2436           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2437         }
2438
2439       if (bb_vinfo)
2440         {
2441           /* Mark the statement as unvectorizable.  */
2442           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2443           return true;
2444         }
2445
2446       dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2447       STMT_VINFO_STRIDED_P (stmt_info) = true;
2448       return true;
2449     }
2450
2451   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2452     {
2453       /* First stmt in the interleaving chain. Check the chain.  */
2454       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2455       struct data_reference *data_ref = dr;
2456       unsigned int count = 1;
2457       tree prev_init = DR_INIT (data_ref);
2458       gimple *prev = stmt;
2459       HOST_WIDE_INT diff, gaps = 0;
2460
2461       while (next)
2462         {
2463           /* Skip same data-refs.  In case that two or more stmts share
2464              data-ref (supported only for loads), we vectorize only the first
2465              stmt, and the rest get their vectorized loads from the first
2466              one.  */
2467           if (!tree_int_cst_compare (DR_INIT (data_ref),
2468                                      DR_INIT (STMT_VINFO_DATA_REF (
2469                                                    vinfo_for_stmt (next)))))
2470             {
2471               if (DR_IS_WRITE (data_ref))
2472                 {
2473                   if (dump_enabled_p ())
2474                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2475                                      "Two store stmts share the same dr.\n");
2476                   return false;
2477                 }
2478
2479               if (dump_enabled_p ())
2480                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2481                                  "Two or more load stmts share the same dr.\n");
2482
2483               /* For load use the same data-ref load.  */
2484               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2485
2486               prev = next;
2487               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2488               continue;
2489             }
2490
2491           prev = next;
2492           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2493
2494           /* All group members have the same STEP by construction.  */
2495           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2496
2497           /* Check that the distance between two accesses is equal to the type
2498              size. Otherwise, we have gaps.  */
2499           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2500                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2501           if (diff != 1)
2502             {
2503               /* FORNOW: SLP of accesses with gaps is not supported.  */
2504               slp_impossible = true;
2505               if (DR_IS_WRITE (data_ref))
2506                 {
2507                   if (dump_enabled_p ())
2508                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2509                                      "interleaved store with gaps\n");
2510                   return false;
2511                 }
2512
2513               gaps += diff - 1;
2514             }
2515
2516           last_accessed_element += diff;
2517
2518           /* Store the gap from the previous member of the group. If there is no
2519              gap in the access, GROUP_GAP is always 1.  */
2520           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2521
2522           prev_init = DR_INIT (data_ref);
2523           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2524           /* Count the number of data-refs in the chain.  */
2525           count++;
2526         }
2527
2528       if (groupsize == 0)
2529         groupsize = count + gaps;
2530
2531       /* This could be UINT_MAX but as we are generating code in a very
2532          inefficient way we have to cap earlier.  See PR78699 for example.  */
2533       if (groupsize > 4096)
2534         {
2535           if (dump_enabled_p ())
2536             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2537                              "group is too large\n");
2538           return false;
2539         }
2540
2541       /* Check that the size of the interleaving is equal to count for stores,
2542          i.e., that there are no gaps.  */
2543       if (groupsize != count
2544           && !DR_IS_READ (dr))
2545         {
2546           if (dump_enabled_p ())
2547             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2548                              "interleaved store with gaps\n");
2549           return false;
2550         }
2551
2552       /* If there is a gap after the last load in the group it is the
2553          difference between the groupsize and the last accessed
2554          element.
2555          When there is no gap, this difference should be 0.  */
2556       GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2557
2558       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2559       if (dump_enabled_p ())
2560         {
2561           dump_printf_loc (MSG_NOTE, vect_location,
2562                            "Detected interleaving ");
2563           if (DR_IS_READ (dr))
2564             dump_printf (MSG_NOTE, "load ");
2565           else
2566             dump_printf (MSG_NOTE, "store ");
2567           dump_printf (MSG_NOTE, "of size %u starting with ",
2568                        (unsigned)groupsize);
2569           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2570           if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2571             dump_printf_loc (MSG_NOTE, vect_location,
2572                              "There is a gap of %u elements after the group\n",
2573                              GROUP_GAP (vinfo_for_stmt (stmt)));
2574         }
2575
2576       /* SLP: create an SLP data structure for every interleaving group of
2577          stores for further analysis in vect_analyse_slp.  */
2578       if (DR_IS_WRITE (dr) && !slp_impossible)
2579         {
2580           if (loop_vinfo)
2581             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2582           if (bb_vinfo)
2583             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2584         }
2585     }
2586
2587   return true;
2588 }
2589
2590 /* Analyze groups of accesses: check that DR belongs to a group of
2591    accesses of legal size, step, etc.  Detect gaps, single element
2592    interleaving, and other special cases. Set grouped access info.
2593    Collect groups of strided stores for further use in SLP analysis.  */
2594
2595 static bool
2596 vect_analyze_group_access (struct data_reference *dr)
2597 {
2598   if (!vect_analyze_group_access_1 (dr))
2599     {
2600       /* Dissolve the group if present.  */
2601       gimple *next;
2602       gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2603       while (stmt)
2604         {
2605           stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2606           next = GROUP_NEXT_ELEMENT (vinfo);
2607           GROUP_FIRST_ELEMENT (vinfo) = NULL;
2608           GROUP_NEXT_ELEMENT (vinfo) = NULL;
2609           stmt = next;
2610         }
2611       return false;
2612     }
2613   return true;
2614 }
2615
2616 /* Analyze the access pattern of the data-reference DR.
2617    In case of non-consecutive accesses call vect_analyze_group_access() to
2618    analyze groups of accesses.  */
2619
2620 static bool
2621 vect_analyze_data_ref_access (struct data_reference *dr)
2622 {
2623   tree step = DR_STEP (dr);
2624   tree scalar_type = TREE_TYPE (DR_REF (dr));
2625   gimple *stmt = DR_STMT (dr);
2626   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2627   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2628   struct loop *loop = NULL;
2629
2630   if (loop_vinfo)
2631     loop = LOOP_VINFO_LOOP (loop_vinfo);
2632
2633   if (loop_vinfo && !step)
2634     {
2635       if (dump_enabled_p ())
2636         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2637                          "bad data-ref access in loop\n");
2638       return false;
2639     }
2640
2641   /* Allow loads with zero step in inner-loop vectorization.  */
2642   if (loop_vinfo && integer_zerop (step))
2643     {
2644       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2645       if (!nested_in_vect_loop_p (loop, stmt))
2646         return DR_IS_READ (dr);
2647       /* Allow references with zero step for outer loops marked
2648          with pragma omp simd only - it guarantees absence of
2649          loop-carried dependencies between inner loop iterations.  */
2650       if (!loop->force_vectorize)
2651         {
2652           if (dump_enabled_p ())
2653             dump_printf_loc (MSG_NOTE, vect_location,
2654                              "zero step in inner loop of nest\n");
2655           return false;
2656         }
2657     }
2658
2659   if (loop && nested_in_vect_loop_p (loop, stmt))
2660     {
2661       /* Interleaved accesses are not yet supported within outer-loop
2662         vectorization for references in the inner-loop.  */
2663       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2664
2665       /* For the rest of the analysis we use the outer-loop step.  */
2666       step = STMT_VINFO_DR_STEP (stmt_info);
2667       if (integer_zerop (step))
2668         {
2669           if (dump_enabled_p ())
2670             dump_printf_loc (MSG_NOTE, vect_location,
2671                              "zero step in outer loop.\n");
2672           return DR_IS_READ (dr);
2673         }
2674     }
2675
2676   /* Consecutive?  */
2677   if (TREE_CODE (step) == INTEGER_CST)
2678     {
2679       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2680       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2681           || (dr_step < 0
2682               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2683         {
2684           /* Mark that it is not interleaving.  */
2685           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2686           return true;
2687         }
2688     }
2689
2690   if (loop && nested_in_vect_loop_p (loop, stmt))
2691     {
2692       if (dump_enabled_p ())
2693         dump_printf_loc (MSG_NOTE, vect_location,
2694                          "grouped access in outer loop.\n");
2695       return false;
2696     }
2697
2698
2699   /* Assume this is a DR handled by non-constant strided load case.  */
2700   if (TREE_CODE (step) != INTEGER_CST)
2701     return (STMT_VINFO_STRIDED_P (stmt_info)
2702             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2703                 || vect_analyze_group_access (dr)));
2704
2705   /* Not consecutive access - check if it's a part of interleaving group.  */
2706   return vect_analyze_group_access (dr);
2707 }
2708
2709 /* Compare two data-references DRA and DRB to group them into chunks
2710    suitable for grouping.  */
2711
2712 static int
2713 dr_group_sort_cmp (const void *dra_, const void *drb_)
2714 {
2715   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2716   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2717   int cmp;
2718
2719   /* Stabilize sort.  */
2720   if (dra == drb)
2721     return 0;
2722
2723   /* DRs in different loops never belong to the same group.  */
2724   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2725   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2726   if (loopa != loopb)
2727     return loopa->num < loopb->num ? -1 : 1;
2728
2729   /* Ordering of DRs according to base.  */
2730   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2731                                DR_BASE_ADDRESS (drb));
2732   if (cmp != 0)
2733     return cmp;
2734
2735   /* And according to DR_OFFSET.  */
2736   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2737   if (cmp != 0)
2738     return cmp;
2739
2740   /* Put reads before writes.  */
2741   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2742     return DR_IS_READ (dra) ? -1 : 1;
2743
2744   /* Then sort after access size.  */
2745   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2746                                TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2747   if (cmp != 0)
2748     return cmp;
2749
2750   /* And after step.  */
2751   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2752   if (cmp != 0)
2753     return cmp;
2754
2755   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2756   cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2757   if (cmp == 0)
2758     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2759   return cmp;
2760 }
2761
2762 /* Function vect_analyze_data_ref_accesses.
2763
2764    Analyze the access pattern of all the data references in the loop.
2765
2766    FORNOW: the only access pattern that is considered vectorizable is a
2767            simple step 1 (consecutive) access.
2768
2769    FORNOW: handle only arrays and pointer accesses.  */
2770
2771 bool
2772 vect_analyze_data_ref_accesses (vec_info *vinfo)
2773 {
2774   unsigned int i;
2775   vec<data_reference_p> datarefs = vinfo->datarefs;
2776   struct data_reference *dr;
2777
2778   if (dump_enabled_p ())
2779     dump_printf_loc (MSG_NOTE, vect_location,
2780                      "=== vect_analyze_data_ref_accesses ===\n");
2781
2782   if (datarefs.is_empty ())
2783     return true;
2784
2785   /* Sort the array of datarefs to make building the interleaving chains
2786      linear.  Don't modify the original vector's order, it is needed for
2787      determining what dependencies are reversed.  */
2788   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2789   datarefs_copy.qsort (dr_group_sort_cmp);
2790
2791   /* Build the interleaving chains.  */
2792   for (i = 0; i < datarefs_copy.length () - 1;)
2793     {
2794       data_reference_p dra = datarefs_copy[i];
2795       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2796       stmt_vec_info lastinfo = NULL;
2797       if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2798         {
2799           ++i;
2800           continue;
2801         }
2802       for (i = i + 1; i < datarefs_copy.length (); ++i)
2803         {
2804           data_reference_p drb = datarefs_copy[i];
2805           stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2806           if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2807             break;
2808
2809           /* ???  Imperfect sorting (non-compatible types, non-modulo
2810              accesses, same accesses) can lead to a group to be artificially
2811              split here as we don't just skip over those.  If it really
2812              matters we can push those to a worklist and re-iterate
2813              over them.  The we can just skip ahead to the next DR here.  */
2814
2815           /* DRs in a different loop should not be put into the same
2816              interleaving group.  */
2817           if (gimple_bb (DR_STMT (dra))->loop_father
2818               != gimple_bb (DR_STMT (drb))->loop_father)
2819             break;
2820
2821           /* Check that the data-refs have same first location (except init)
2822              and they are both either store or load (not load and store,
2823              not masked loads or stores).  */
2824           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2825               || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2826                                         DR_BASE_ADDRESS (drb)) != 0
2827               || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2828               || !gimple_assign_single_p (DR_STMT (dra))
2829               || !gimple_assign_single_p (DR_STMT (drb)))
2830             break;
2831
2832           /* Check that the data-refs have the same constant size.  */
2833           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2834           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2835           if (!tree_fits_uhwi_p (sza)
2836               || !tree_fits_uhwi_p (szb)
2837               || !tree_int_cst_equal (sza, szb))
2838             break;
2839
2840           /* Check that the data-refs have the same step.  */
2841           if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2842             break;
2843
2844           /* Do not place the same access in the interleaving chain twice.  */
2845           if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2846             break;
2847
2848           /* Check the types are compatible.
2849              ???  We don't distinguish this during sorting.  */
2850           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2851                                    TREE_TYPE (DR_REF (drb))))
2852             break;
2853
2854           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2855           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2856           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2857           gcc_assert (init_a <= init_b);
2858
2859           /* If init_b == init_a + the size of the type * k, we have an
2860              interleaving, and DRA is accessed before DRB.  */
2861           HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2862           if (type_size_a == 0
2863               || (init_b - init_a) % type_size_a != 0)
2864             break;
2865
2866           /* If we have a store, the accesses are adjacent.  This splits
2867              groups into chunks we support (we don't support vectorization
2868              of stores with gaps).  */
2869           if (!DR_IS_READ (dra)
2870               && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2871                                              (DR_INIT (datarefs_copy[i-1]))
2872                   != type_size_a))
2873             break;
2874
2875           /* If the step (if not zero or non-constant) is greater than the
2876              difference between data-refs' inits this splits groups into
2877              suitable sizes.  */
2878           if (tree_fits_shwi_p (DR_STEP (dra)))
2879             {
2880               HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2881               if (step != 0 && step <= (init_b - init_a))
2882                 break;
2883             }
2884
2885           if (dump_enabled_p ())
2886             {
2887               dump_printf_loc (MSG_NOTE, vect_location,
2888                                "Detected interleaving ");
2889               if (DR_IS_READ (dra))
2890                 dump_printf (MSG_NOTE, "load ");
2891               else
2892                 dump_printf (MSG_NOTE, "store ");
2893               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2894               dump_printf (MSG_NOTE,  " and ");
2895               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2896               dump_printf (MSG_NOTE, "\n");
2897             }
2898
2899           /* Link the found element into the group list.  */
2900           if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2901             {
2902               GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2903               lastinfo = stmtinfo_a;
2904             }
2905           GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2906           GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2907           lastinfo = stmtinfo_b;
2908         }
2909     }
2910
2911   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2912     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2913         && !vect_analyze_data_ref_access (dr))
2914       {
2915         if (dump_enabled_p ())
2916           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2917                            "not vectorized: complicated access pattern.\n");
2918
2919         if (is_a <bb_vec_info> (vinfo))
2920           {
2921             /* Mark the statement as not vectorizable.  */
2922             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2923             continue;
2924           }
2925         else
2926           {
2927             datarefs_copy.release ();
2928             return false;
2929           }
2930       }
2931
2932   datarefs_copy.release ();
2933   return true;
2934 }
2935
2936 /* Function vect_vfa_segment_size.
2937
2938    Create an expression that computes the size of segment
2939    that will be accessed for a data reference.  The functions takes into
2940    account that realignment loads may access one more vector.
2941
2942    Input:
2943      DR: The data reference.
2944      LENGTH_FACTOR: segment length to consider.
2945
2946    Return an expression whose value is the size of segment which will be
2947    accessed by DR.  */
2948
2949 static tree
2950 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2951 {
2952   tree segment_length;
2953
2954   if (integer_zerop (DR_STEP (dr)))
2955     segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2956   else
2957     segment_length = size_binop (MULT_EXPR,
2958                                  fold_convert (sizetype, DR_STEP (dr)),
2959                                  fold_convert (sizetype, length_factor));
2960
2961   if (vect_supportable_dr_alignment (dr, false)
2962         == dr_explicit_realign_optimized)
2963     {
2964       tree vector_size = TYPE_SIZE_UNIT
2965                           (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2966
2967       segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2968     }
2969   return segment_length;
2970 }
2971
2972 /* Function vect_no_alias_p.
2973
2974    Given data references A and B with equal base and offset, the alias
2975    relation can be decided at compilation time, return TRUE if they do
2976    not alias to each other; return FALSE otherwise.  SEGMENT_LENGTH_A
2977    and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2978    respectively.  */
2979
2980 static bool
2981 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2982                  tree segment_length_a, tree segment_length_b)
2983 {
2984   gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
2985               && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
2986   if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
2987     return false;
2988
2989   tree seg_a_min = DR_INIT (a);
2990   tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
2991                                 seg_a_min, segment_length_a);
2992   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
2993      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
2994      [a, a+12) */
2995   if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
2996     {
2997       tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
2998       seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
2999                                seg_a_max, unit_size);
3000       seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
3001                                DR_INIT (a), unit_size);
3002     }
3003   tree seg_b_min = DR_INIT (b);
3004   tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
3005                                 seg_b_min, segment_length_b);
3006   if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3007     {
3008       tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
3009       seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
3010                                seg_b_max, unit_size);
3011       seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
3012                                DR_INIT (b), unit_size);
3013     }
3014
3015   if (tree_int_cst_le (seg_a_max, seg_b_min)
3016       || tree_int_cst_le (seg_b_max, seg_a_min))
3017     return true;
3018
3019   return false;
3020 }
3021
3022 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3023    in DDR is >= VF.  */
3024
3025 static bool
3026 dependence_distance_ge_vf (data_dependence_relation *ddr,
3027                            unsigned int loop_depth, unsigned HOST_WIDE_INT vf)
3028 {
3029   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3030       || DDR_NUM_DIST_VECTS (ddr) == 0)
3031     return false;
3032
3033   /* If the dependence is exact, we should have limited the VF instead.  */
3034   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3035
3036   unsigned int i;
3037   lambda_vector dist_v;
3038   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3039     {
3040       HOST_WIDE_INT dist = dist_v[loop_depth];
3041       if (dist != 0
3042           && !(dist > 0 && DDR_REVERSED_P (ddr))
3043           && (unsigned HOST_WIDE_INT) abs_hwi (dist) < vf)
3044         return false;
3045     }
3046
3047   if (dump_enabled_p ())
3048     {
3049       dump_printf_loc (MSG_NOTE, vect_location,
3050                        "dependence distance between ");
3051       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3052       dump_printf (MSG_NOTE,  " and ");
3053       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3054       dump_printf (MSG_NOTE,  " is >= VF\n");
3055     }
3056
3057   return true;
3058 }
3059
3060 /* Function vect_prune_runtime_alias_test_list.
3061
3062    Prune a list of ddrs to be tested at run-time by versioning for alias.
3063    Merge several alias checks into one if possible.
3064    Return FALSE if resulting list of ddrs is longer then allowed by
3065    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3066
3067 bool
3068 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3069 {
3070   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3071   hash_set <tree_pair_hash> compared_objects;
3072
3073   vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3074   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3075     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3076   vec<vec_object_pair> &check_unequal_addrs
3077     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3078   int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3079   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3080
3081   ddr_p ddr;
3082   unsigned int i;
3083   tree length_factor;
3084
3085   if (dump_enabled_p ())
3086     dump_printf_loc (MSG_NOTE, vect_location,
3087                      "=== vect_prune_runtime_alias_test_list ===\n");
3088
3089   if (may_alias_ddrs.is_empty ())
3090     return true;
3091
3092   comp_alias_ddrs.create (may_alias_ddrs.length ());
3093
3094   unsigned int loop_depth
3095     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3096                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
3097
3098   /* First, we collect all data ref pairs for aliasing checks.  */
3099   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3100     {
3101       int comp_res;
3102       struct data_reference *dr_a, *dr_b;
3103       gimple *dr_group_first_a, *dr_group_first_b;
3104       tree segment_length_a, segment_length_b;
3105       gimple *stmt_a, *stmt_b;
3106
3107       /* Ignore the alias if the VF we chose ended up being no greater
3108          than the dependence distance.  */
3109       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3110         continue;
3111
3112       if (DDR_OBJECT_A (ddr))
3113         {
3114           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3115           if (!compared_objects.add (new_pair))
3116             {
3117               if (dump_enabled_p ())
3118                 {
3119                   dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3120                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3121                   dump_printf (MSG_NOTE, " and ");
3122                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3123                   dump_printf (MSG_NOTE, " have different addresses\n");
3124                 }
3125               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3126             }
3127           continue;
3128         }
3129
3130       dr_a = DDR_A (ddr);
3131       stmt_a = DR_STMT (DDR_A (ddr));
3132       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3133       if (dr_group_first_a)
3134         {
3135           stmt_a = dr_group_first_a;
3136           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3137         }
3138
3139       dr_b = DDR_B (ddr);
3140       stmt_b = DR_STMT (DDR_B (ddr));
3141       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3142       if (dr_group_first_b)
3143         {
3144           stmt_b = dr_group_first_b;
3145           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3146         }
3147
3148       if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3149         length_factor = scalar_loop_iters;
3150       else
3151         length_factor = size_int (vect_factor);
3152       segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3153       segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3154
3155       comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3156                                         DR_BASE_ADDRESS (dr_b));
3157       if (comp_res == 0)
3158         comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3159                                           DR_OFFSET (dr_b));
3160
3161       /* Alias is known at compilation time.  */
3162       if (comp_res == 0
3163           && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3164           && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3165           && TREE_CODE (segment_length_a) == INTEGER_CST
3166           && TREE_CODE (segment_length_b) == INTEGER_CST)
3167         {
3168           if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3169             continue;
3170
3171           if (dump_enabled_p ())
3172             dump_printf_loc (MSG_NOTE, vect_location,
3173                              "not vectorized: compilation time alias.\n");
3174
3175           return false;
3176         }
3177
3178       dr_with_seg_len_pair_t dr_with_seg_len_pair
3179           (dr_with_seg_len (dr_a, segment_length_a),
3180            dr_with_seg_len (dr_b, segment_length_b));
3181
3182       /* Canonicalize pairs by sorting the two DR members.  */
3183       if (comp_res > 0)
3184         std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3185
3186       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3187     }
3188
3189   prune_runtime_alias_test_list (&comp_alias_ddrs,
3190                                  (unsigned HOST_WIDE_INT) vect_factor);
3191
3192   unsigned int count = (comp_alias_ddrs.length ()
3193                         + check_unequal_addrs.length ());
3194   dump_printf_loc (MSG_NOTE, vect_location,
3195                    "improved number of alias checks from %d to %d\n",
3196                    may_alias_ddrs.length (), count);
3197   if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3198     {
3199       if (dump_enabled_p ())
3200         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3201                          "number of versioning for alias "
3202                          "run-time tests exceeds %d "
3203                          "(--param vect-max-version-for-alias-checks)\n",
3204                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3205       return false;
3206     }
3207
3208   return true;
3209 }
3210
3211 /* Return true if a non-affine read or write in STMT is suitable for a
3212    gather load or scatter store.  Describe the operation in *INFO if so.  */
3213
3214 bool
3215 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3216                            gather_scatter_info *info)
3217 {
3218   HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3219   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3220   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3221   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3222   tree offtype = NULL_TREE;
3223   tree decl, base, off;
3224   machine_mode pmode;
3225   int punsignedp, reversep, pvolatilep = 0;
3226
3227   base = DR_REF (dr);
3228   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3229      see if we can use the def stmt of the address.  */
3230   if (is_gimple_call (stmt)
3231       && gimple_call_internal_p (stmt)
3232       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3233           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3234       && TREE_CODE (base) == MEM_REF
3235       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3236       && integer_zerop (TREE_OPERAND (base, 1))
3237       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3238     {
3239       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3240       if (is_gimple_assign (def_stmt)
3241           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3242         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3243     }
3244
3245   /* The gather and scatter builtins need address of the form
3246      loop_invariant + vector * {1, 2, 4, 8}
3247      or
3248      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3249      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3250      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3251      multiplications and additions in it.  To get a vector, we need
3252      a single SSA_NAME that will be defined in the loop and will
3253      contain everything that is not loop invariant and that can be
3254      vectorized.  The following code attempts to find such a preexistng
3255      SSA_NAME OFF and put the loop invariants into a tree BASE
3256      that can be gimplified before the loop.  */
3257   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3258                               &punsignedp, &reversep, &pvolatilep);
3259   gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3260
3261   if (TREE_CODE (base) == MEM_REF)
3262     {
3263       if (!integer_zerop (TREE_OPERAND (base, 1)))
3264         {
3265           if (off == NULL_TREE)
3266             {
3267               offset_int moff = mem_ref_offset (base);
3268               off = wide_int_to_tree (sizetype, moff);
3269             }
3270           else
3271             off = size_binop (PLUS_EXPR, off,
3272                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3273         }
3274       base = TREE_OPERAND (base, 0);
3275     }
3276   else
3277     base = build_fold_addr_expr (base);
3278
3279   if (off == NULL_TREE)
3280     off = size_zero_node;
3281
3282   /* If base is not loop invariant, either off is 0, then we start with just
3283      the constant offset in the loop invariant BASE and continue with base
3284      as OFF, otherwise give up.
3285      We could handle that case by gimplifying the addition of base + off
3286      into some SSA_NAME and use that as off, but for now punt.  */
3287   if (!expr_invariant_in_loop_p (loop, base))
3288     {
3289       if (!integer_zerop (off))
3290         return false;
3291       off = base;
3292       base = size_int (pbitpos / BITS_PER_UNIT);
3293     }
3294   /* Otherwise put base + constant offset into the loop invariant BASE
3295      and continue with OFF.  */
3296   else
3297     {
3298       base = fold_convert (sizetype, base);
3299       base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3300     }
3301
3302   /* OFF at this point may be either a SSA_NAME or some tree expression
3303      from get_inner_reference.  Try to peel off loop invariants from it
3304      into BASE as long as possible.  */
3305   STRIP_NOPS (off);
3306   while (offtype == NULL_TREE)
3307     {
3308       enum tree_code code;
3309       tree op0, op1, add = NULL_TREE;
3310
3311       if (TREE_CODE (off) == SSA_NAME)
3312         {
3313           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3314
3315           if (expr_invariant_in_loop_p (loop, off))
3316             return false;
3317
3318           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3319             break;
3320
3321           op0 = gimple_assign_rhs1 (def_stmt);
3322           code = gimple_assign_rhs_code (def_stmt);
3323           op1 = gimple_assign_rhs2 (def_stmt);
3324         }
3325       else
3326         {
3327           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3328             return false;
3329           code = TREE_CODE (off);
3330           extract_ops_from_tree (off, &code, &op0, &op1);
3331         }
3332       switch (code)
3333         {
3334         case POINTER_PLUS_EXPR:
3335         case PLUS_EXPR:
3336           if (expr_invariant_in_loop_p (loop, op0))
3337             {
3338               add = op0;
3339               off = op1;
3340             do_add:
3341               add = fold_convert (sizetype, add);
3342               if (scale != 1)
3343                 add = size_binop (MULT_EXPR, add, size_int (scale));
3344               base = size_binop (PLUS_EXPR, base, add);
3345               continue;
3346             }
3347           if (expr_invariant_in_loop_p (loop, op1))
3348             {
3349               add = op1;
3350               off = op0;
3351               goto do_add;
3352             }
3353           break;
3354         case MINUS_EXPR:
3355           if (expr_invariant_in_loop_p (loop, op1))
3356             {
3357               add = fold_convert (sizetype, op1);
3358               add = size_binop (MINUS_EXPR, size_zero_node, add);
3359               off = op0;
3360               goto do_add;
3361             }
3362           break;
3363         case MULT_EXPR:
3364           if (scale == 1 && tree_fits_shwi_p (op1))
3365             {
3366               scale = tree_to_shwi (op1);
3367               off = op0;
3368               continue;
3369             }
3370           break;
3371         case SSA_NAME:
3372           off = op0;
3373           continue;
3374         CASE_CONVERT:
3375           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3376               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3377             break;
3378           if (TYPE_PRECISION (TREE_TYPE (op0))
3379               == TYPE_PRECISION (TREE_TYPE (off)))
3380             {
3381               off = op0;
3382               continue;
3383             }
3384           if (TYPE_PRECISION (TREE_TYPE (op0))
3385               < TYPE_PRECISION (TREE_TYPE (off)))
3386             {
3387               off = op0;
3388               offtype = TREE_TYPE (off);
3389               STRIP_NOPS (off);
3390               continue;
3391             }
3392           break;
3393         default:
3394           break;
3395         }
3396       break;
3397     }
3398
3399   /* If at the end OFF still isn't a SSA_NAME or isn't
3400      defined in the loop, punt.  */
3401   if (TREE_CODE (off) != SSA_NAME
3402       || expr_invariant_in_loop_p (loop, off))
3403     return false;
3404
3405   if (offtype == NULL_TREE)
3406     offtype = TREE_TYPE (off);
3407
3408   if (DR_IS_READ (dr))
3409     decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3410                                              offtype, scale);
3411   else
3412     decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3413                                               offtype, scale);
3414
3415   if (decl == NULL_TREE)
3416     return false;
3417
3418   info->decl = decl;
3419   info->base = base;
3420   info->offset = off;
3421   info->offset_dt = vect_unknown_def_type;
3422   info->offset_vectype = NULL_TREE;
3423   info->scale = scale;
3424   return true;
3425 }
3426
3427 /* Function vect_analyze_data_refs.
3428
3429   Find all the data references in the loop or basic block.
3430
3431    The general structure of the analysis of data refs in the vectorizer is as
3432    follows:
3433    1- vect_analyze_data_refs(loop/bb): call
3434       compute_data_dependences_for_loop/bb to find and analyze all data-refs
3435       in the loop/bb and their dependences.
3436    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3437    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3438    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3439
3440 */
3441
3442 bool
3443 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3444 {
3445   struct loop *loop = NULL;
3446   unsigned int i;
3447   struct data_reference *dr;
3448   tree scalar_type;
3449
3450   if (dump_enabled_p ())
3451     dump_printf_loc (MSG_NOTE, vect_location,
3452                      "=== vect_analyze_data_refs ===\n");
3453
3454   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3455     loop = LOOP_VINFO_LOOP (loop_vinfo);
3456
3457   /* Go through the data-refs, check that the analysis succeeded.  Update
3458      pointer from stmt_vec_info struct to DR and vectype.  */
3459
3460   vec<data_reference_p> datarefs = vinfo->datarefs;
3461   FOR_EACH_VEC_ELT (datarefs, i, dr)
3462     {
3463       gimple *stmt;
3464       stmt_vec_info stmt_info;
3465       tree base, offset, init;
3466       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3467       bool simd_lane_access = false;
3468       int vf;
3469
3470 again:
3471       if (!dr || !DR_REF (dr))
3472         {
3473           if (dump_enabled_p ())
3474             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3475                              "not vectorized: unhandled data-ref\n");
3476           return false;
3477         }
3478
3479       stmt = DR_STMT (dr);
3480       stmt_info = vinfo_for_stmt (stmt);
3481
3482       /* Discard clobbers from the dataref vector.  We will remove
3483          clobber stmts during vectorization.  */
3484       if (gimple_clobber_p (stmt))
3485         {
3486           free_data_ref (dr);
3487           if (i == datarefs.length () - 1)
3488             {
3489               datarefs.pop ();
3490               break;
3491             }
3492           datarefs.ordered_remove (i);
3493           dr = datarefs[i];
3494           goto again;
3495         }
3496
3497       /* Check that analysis of the data-ref succeeded.  */
3498       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3499           || !DR_STEP (dr))
3500         {
3501           bool maybe_gather
3502             = DR_IS_READ (dr)
3503               && !TREE_THIS_VOLATILE (DR_REF (dr))
3504               && targetm.vectorize.builtin_gather != NULL;
3505           bool maybe_scatter
3506             = DR_IS_WRITE (dr)
3507               && !TREE_THIS_VOLATILE (DR_REF (dr))
3508               && targetm.vectorize.builtin_scatter != NULL;
3509           bool maybe_simd_lane_access
3510             = is_a <loop_vec_info> (vinfo) && loop->simduid;
3511
3512           /* If target supports vector gather loads or scatter stores, or if
3513              this might be a SIMD lane access, see if they can't be used.  */
3514           if (is_a <loop_vec_info> (vinfo)
3515               && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3516               && !nested_in_vect_loop_p (loop, stmt))
3517             {
3518               struct data_reference *newdr
3519                 = create_data_ref (NULL, loop_containing_stmt (stmt),
3520                                    DR_REF (dr), stmt, !maybe_scatter,
3521                                    DR_IS_CONDITIONAL_IN_STMT (dr));
3522               gcc_assert (newdr != NULL && DR_REF (newdr));
3523               if (DR_BASE_ADDRESS (newdr)
3524                   && DR_OFFSET (newdr)
3525                   && DR_INIT (newdr)
3526                   && DR_STEP (newdr)
3527                   && integer_zerop (DR_STEP (newdr)))
3528                 {
3529                   if (maybe_simd_lane_access)
3530                     {
3531                       tree off = DR_OFFSET (newdr);
3532                       STRIP_NOPS (off);
3533                       if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3534                           && TREE_CODE (off) == MULT_EXPR
3535                           && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3536                         {
3537                           tree step = TREE_OPERAND (off, 1);
3538                           off = TREE_OPERAND (off, 0);
3539                           STRIP_NOPS (off);
3540                           if (CONVERT_EXPR_P (off)
3541                               && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3542                                                                           0)))
3543                                  < TYPE_PRECISION (TREE_TYPE (off)))
3544                             off = TREE_OPERAND (off, 0);
3545                           if (TREE_CODE (off) == SSA_NAME)
3546                             {
3547                               gimple *def = SSA_NAME_DEF_STMT (off);
3548                               tree reft = TREE_TYPE (DR_REF (newdr));
3549                               if (is_gimple_call (def)
3550                                   && gimple_call_internal_p (def)
3551                                   && (gimple_call_internal_fn (def)
3552                                       == IFN_GOMP_SIMD_LANE))
3553                                 {
3554                                   tree arg = gimple_call_arg (def, 0);
3555                                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
3556                                   arg = SSA_NAME_VAR (arg);
3557                                   if (arg == loop->simduid
3558                                       /* For now.  */
3559                                       && tree_int_cst_equal
3560                                            (TYPE_SIZE_UNIT (reft),
3561                                             step))
3562                                     {
3563                                       DR_OFFSET (newdr) = ssize_int (0);
3564                                       DR_STEP (newdr) = step;
3565                                       DR_OFFSET_ALIGNMENT (newdr)
3566                                         = BIGGEST_ALIGNMENT;
3567                                       DR_STEP_ALIGNMENT (newdr)
3568                                         = highest_pow2_factor (step);
3569                                       dr = newdr;
3570                                       simd_lane_access = true;
3571                                     }
3572                                 }
3573                             }
3574                         }
3575                     }
3576                   if (!simd_lane_access && (maybe_gather || maybe_scatter))
3577                     {
3578                       dr = newdr;
3579                       if (maybe_gather)
3580                         gatherscatter = GATHER;
3581                       else
3582                         gatherscatter = SCATTER;
3583                     }
3584                 }
3585               if (gatherscatter == SG_NONE && !simd_lane_access)
3586                 free_data_ref (newdr);
3587             }
3588
3589           if (gatherscatter == SG_NONE && !simd_lane_access)
3590             {
3591               if (dump_enabled_p ())
3592                 {
3593                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3594                                    "not vectorized: data ref analysis "
3595                                    "failed ");
3596                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3597                 }
3598
3599               if (is_a <bb_vec_info> (vinfo))
3600                 break;
3601
3602               return false;
3603             }
3604         }
3605
3606       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3607         {
3608           if (dump_enabled_p ())
3609             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3610                              "not vectorized: base addr of dr is a "
3611                              "constant\n");
3612
3613           if (is_a <bb_vec_info> (vinfo))
3614             break;
3615
3616           if (gatherscatter != SG_NONE || simd_lane_access)
3617             free_data_ref (dr);
3618           return false;
3619         }
3620
3621       if (TREE_THIS_VOLATILE (DR_REF (dr)))
3622         {
3623           if (dump_enabled_p ())
3624             {
3625               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3626                                "not vectorized: volatile type ");
3627               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3628             }
3629
3630           if (is_a <bb_vec_info> (vinfo))
3631             break;
3632
3633           return false;
3634         }
3635
3636       if (stmt_can_throw_internal (stmt))
3637         {
3638           if (dump_enabled_p ())
3639             {
3640               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3641                                "not vectorized: statement can throw an "
3642                                "exception ");
3643               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3644             }
3645
3646           if (is_a <bb_vec_info> (vinfo))
3647             break;
3648
3649           if (gatherscatter != SG_NONE || simd_lane_access)
3650             free_data_ref (dr);
3651           return false;
3652         }
3653
3654       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3655           && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3656         {
3657           if (dump_enabled_p ())
3658             {
3659               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3660                                "not vectorized: statement is bitfield "
3661                                "access ");
3662               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3663             }
3664
3665           if (is_a <bb_vec_info> (vinfo))
3666             break;
3667
3668           if (gatherscatter != SG_NONE || simd_lane_access)
3669             free_data_ref (dr);
3670           return false;
3671         }
3672
3673       base = unshare_expr (DR_BASE_ADDRESS (dr));
3674       offset = unshare_expr (DR_OFFSET (dr));
3675       init = unshare_expr (DR_INIT (dr));
3676
3677       if (is_gimple_call (stmt)
3678           && (!gimple_call_internal_p (stmt)
3679               || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3680                   && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3681         {
3682           if (dump_enabled_p ())
3683             {
3684               dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
3685                                "not vectorized: dr in a call ");
3686               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3687             }
3688
3689           if (is_a <bb_vec_info> (vinfo))
3690             break;
3691
3692           if (gatherscatter != SG_NONE || simd_lane_access)
3693             free_data_ref (dr);
3694           return false;
3695         }
3696
3697       /* Update DR field in stmt_vec_info struct.  */
3698
3699       /* If the dataref is in an inner-loop of the loop that is considered for
3700          for vectorization, we also want to analyze the access relative to
3701          the outer-loop (DR contains information only relative to the
3702          inner-most enclosing loop).  We do that by building a reference to the
3703          first location accessed by the inner-loop, and analyze it relative to
3704          the outer-loop.  */
3705       if (loop && nested_in_vect_loop_p (loop, stmt))
3706         {
3707           /* Build a reference to the first location accessed by the
3708              inner loop: *(BASE + INIT + OFFSET).  By construction,
3709              this address must be invariant in the inner loop, so we
3710              can consider it as being used in the outer loop.  */
3711           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3712                                           init, offset);
3713           tree init_addr = fold_build_pointer_plus (base, init_offset);
3714           tree init_ref = build_fold_indirect_ref (init_addr);
3715
3716           if (dump_enabled_p ())
3717             {
3718               dump_printf_loc (MSG_NOTE, vect_location,
3719                                "analyze in outer loop: ");
3720               dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3721               dump_printf (MSG_NOTE, "\n");
3722             }
3723
3724           if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3725                                      init_ref, loop))
3726             /* dr_analyze_innermost already explained the failure.  */
3727             return false;
3728
3729           if (dump_enabled_p ())
3730             {
3731               dump_printf_loc (MSG_NOTE, vect_location,
3732                                "\touter base_address: ");
3733               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3734                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3735               dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3736               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3737                                  STMT_VINFO_DR_OFFSET (stmt_info));
3738               dump_printf (MSG_NOTE,
3739                            "\n\touter constant offset from base address: ");
3740               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3741                                  STMT_VINFO_DR_INIT (stmt_info));
3742               dump_printf (MSG_NOTE, "\n\touter step: ");
3743               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3744                                  STMT_VINFO_DR_STEP (stmt_info));
3745               dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3746                            STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3747               dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3748                            STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3749               dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3750                            STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3751               dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3752                            STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3753             }
3754         }
3755
3756       if (STMT_VINFO_DATA_REF (stmt_info))
3757         {
3758           if (dump_enabled_p ())
3759             {
3760               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3761                                "not vectorized: more than one data ref "
3762                                "in stmt: ");
3763               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3764             }
3765
3766           if (is_a <bb_vec_info> (vinfo))
3767             break;
3768
3769           if (gatherscatter != SG_NONE || simd_lane_access)
3770             free_data_ref (dr);
3771           return false;
3772         }
3773
3774       STMT_VINFO_DATA_REF (stmt_info) = dr;
3775       if (simd_lane_access)
3776         {
3777           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3778           free_data_ref (datarefs[i]);
3779           datarefs[i] = dr;
3780         }
3781
3782       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3783           && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3784           && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3785         {
3786           if (dump_enabled_p ())
3787             {
3788               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3789                                "not vectorized: base object not addressable "
3790                                "for stmt: ");
3791               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3792             }
3793           if (is_a <bb_vec_info> (vinfo))
3794             {
3795               /* In BB vectorization the ref can still participate
3796                  in dependence analysis, we just can't vectorize it.  */
3797               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3798               continue;
3799             }
3800           return false;
3801         }
3802
3803       /* Set vectype for STMT.  */
3804       scalar_type = TREE_TYPE (DR_REF (dr));
3805       STMT_VINFO_VECTYPE (stmt_info)
3806         = get_vectype_for_scalar_type (scalar_type);
3807       if (!STMT_VINFO_VECTYPE (stmt_info))
3808         {
3809           if (dump_enabled_p ())
3810             {
3811               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3812                                "not vectorized: no vectype for stmt: ");
3813               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3814               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3815               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3816                                  scalar_type);
3817               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3818             }
3819
3820           if (is_a <bb_vec_info> (vinfo))
3821             {
3822               /* No vector type is fine, the ref can still participate
3823                  in dependence analysis, we just can't vectorize it.  */
3824               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3825               continue;
3826             }
3827
3828           if (gatherscatter != SG_NONE || simd_lane_access)
3829             {
3830               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3831               if (gatherscatter != SG_NONE)
3832                 free_data_ref (dr);
3833             }
3834           return false;
3835         }
3836       else
3837         {
3838           if (dump_enabled_p ())
3839             {
3840               dump_printf_loc (MSG_NOTE, vect_location,
3841                                "got vectype for stmt: ");
3842               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3843               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3844                                  STMT_VINFO_VECTYPE (stmt_info));
3845               dump_printf (MSG_NOTE, "\n");
3846             }
3847         }
3848
3849       /* Adjust the minimal vectorization factor according to the
3850          vector type.  */
3851       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3852       if (vf > *min_vf)
3853         *min_vf = vf;
3854
3855       if (gatherscatter != SG_NONE)
3856         {
3857           gather_scatter_info gs_info;
3858           if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3859                                           &gs_info)
3860               || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3861             {
3862               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3863               free_data_ref (dr);
3864               if (dump_enabled_p ())
3865                 {
3866                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3867                                    (gatherscatter == GATHER) ?
3868                                    "not vectorized: not suitable for gather "
3869                                    "load " :
3870                                    "not vectorized: not suitable for scatter "
3871                                    "store ");
3872                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3873                 }
3874               return false;
3875             }
3876
3877           free_data_ref (datarefs[i]);
3878           datarefs[i] = dr;
3879           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3880         }
3881
3882       else if (is_a <loop_vec_info> (vinfo)
3883                && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3884         {
3885           if (nested_in_vect_loop_p (loop, stmt))
3886             {
3887               if (dump_enabled_p ())
3888                 {
3889                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3890                                    "not vectorized: not suitable for strided "
3891                                    "load ");
3892                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3893                 }
3894               return false;
3895             }
3896           STMT_VINFO_STRIDED_P (stmt_info) = true;
3897         }
3898     }
3899
3900   /* If we stopped analysis at the first dataref we could not analyze
3901      when trying to vectorize a basic-block mark the rest of the datarefs
3902      as not vectorizable and truncate the vector of datarefs.  That
3903      avoids spending useless time in analyzing their dependence.  */
3904   if (i != datarefs.length ())
3905     {
3906       gcc_assert (is_a <bb_vec_info> (vinfo));
3907       for (unsigned j = i; j < datarefs.length (); ++j)
3908         {
3909           data_reference_p dr = datarefs[j];
3910           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3911           free_data_ref (dr);
3912         }
3913       datarefs.truncate (i);
3914     }
3915
3916   return true;
3917 }
3918
3919
3920 /* Function vect_get_new_vect_var.
3921
3922    Returns a name for a new variable.  The current naming scheme appends the
3923    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3924    the name of vectorizer generated variables, and appends that to NAME if
3925    provided.  */
3926
3927 tree
3928 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3929 {
3930   const char *prefix;
3931   tree new_vect_var;
3932
3933   switch (var_kind)
3934   {
3935   case vect_simple_var:
3936     prefix = "vect";
3937     break;
3938   case vect_scalar_var:
3939     prefix = "stmp";
3940     break;
3941   case vect_mask_var:
3942     prefix = "mask";
3943     break;
3944   case vect_pointer_var:
3945     prefix = "vectp";
3946     break;
3947   default:
3948     gcc_unreachable ();
3949   }
3950
3951   if (name)
3952     {
3953       char* tmp = concat (prefix, "_", name, NULL);
3954       new_vect_var = create_tmp_reg (type, tmp);
3955       free (tmp);
3956     }
3957   else
3958     new_vect_var = create_tmp_reg (type, prefix);
3959
3960   return new_vect_var;
3961 }
3962
3963 /* Like vect_get_new_vect_var but return an SSA name.  */
3964
3965 tree
3966 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3967 {
3968   const char *prefix;
3969   tree new_vect_var;
3970
3971   switch (var_kind)
3972   {
3973   case vect_simple_var:
3974     prefix = "vect";
3975     break;
3976   case vect_scalar_var:
3977     prefix = "stmp";
3978     break;
3979   case vect_pointer_var:
3980     prefix = "vectp";
3981     break;
3982   default:
3983     gcc_unreachable ();
3984   }
3985
3986   if (name)
3987     {
3988       char* tmp = concat (prefix, "_", name, NULL);
3989       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3990       free (tmp);
3991     }
3992   else
3993     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3994
3995   return new_vect_var;
3996 }
3997
3998 /* Duplicate ptr info and set alignment/misaligment on NAME from DR.  */
3999
4000 static void
4001 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4002 {
4003   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4004   int misalign = DR_MISALIGNMENT (dr);
4005   if (misalign == DR_MISALIGNMENT_UNKNOWN)
4006     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4007   else
4008     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4009                             DR_TARGET_ALIGNMENT (dr), misalign);
4010 }
4011
4012 /* Function vect_create_addr_base_for_vector_ref.
4013
4014    Create an expression that computes the address of the first memory location
4015    that will be accessed for a data reference.
4016
4017    Input:
4018    STMT: The statement containing the data reference.
4019    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4020    OFFSET: Optional. If supplied, it is be added to the initial address.
4021    LOOP:    Specify relative to which loop-nest should the address be computed.
4022             For example, when the dataref is in an inner-loop nested in an
4023             outer-loop that is now being vectorized, LOOP can be either the
4024             outer-loop, or the inner-loop.  The first memory location accessed
4025             by the following dataref ('in' points to short):
4026
4027                 for (i=0; i<N; i++)
4028                    for (j=0; j<M; j++)
4029                      s += in[i+j]
4030
4031             is as follows:
4032             if LOOP=i_loop:     &in             (relative to i_loop)
4033             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
4034    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4035             initial address.  Unlike OFFSET, which is number of elements to
4036             be added, BYTE_OFFSET is measured in bytes.
4037
4038    Output:
4039    1. Return an SSA_NAME whose value is the address of the memory location of
4040       the first vector of the data reference.
4041    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4042       these statement(s) which define the returned SSA_NAME.
4043
4044    FORNOW: We are only handling array accesses with step 1.  */
4045
4046 tree
4047 vect_create_addr_base_for_vector_ref (gimple *stmt,
4048                                       gimple_seq *new_stmt_list,
4049                                       tree offset,
4050                                       tree byte_offset)
4051 {
4052   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4053   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4054   const char *base_name;
4055   tree addr_base;
4056   tree dest;
4057   gimple_seq seq = NULL;
4058   tree vect_ptr_type;
4059   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4060   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4061   innermost_loop_behavior *drb = vect_dr_behavior (dr);
4062
4063   tree data_ref_base = unshare_expr (drb->base_address);
4064   tree base_offset = unshare_expr (drb->offset);
4065   tree init = unshare_expr (drb->init);
4066
4067   if (loop_vinfo)
4068     base_name = get_name (data_ref_base);
4069   else
4070     {
4071       base_offset = ssize_int (0);
4072       init = ssize_int (0);
4073       base_name = get_name (DR_REF (dr));
4074     }
4075
4076   /* Create base_offset */
4077   base_offset = size_binop (PLUS_EXPR,
4078                             fold_convert (sizetype, base_offset),
4079                             fold_convert (sizetype, init));
4080
4081   if (offset)
4082     {
4083       offset = fold_build2 (MULT_EXPR, sizetype,
4084                             fold_convert (sizetype, offset), step);
4085       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4086                                  base_offset, offset);
4087     }
4088   if (byte_offset)
4089     {
4090       byte_offset = fold_convert (sizetype, byte_offset);
4091       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4092                                  base_offset, byte_offset);
4093     }
4094
4095   /* base + base_offset */
4096   if (loop_vinfo)
4097     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4098   else
4099     {
4100       addr_base = build1 (ADDR_EXPR,
4101                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
4102                           unshare_expr (DR_REF (dr)));
4103     }
4104
4105   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4106   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4107   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4108   gimple_seq_add_seq (new_stmt_list, seq);
4109
4110   if (DR_PTR_INFO (dr)
4111       && TREE_CODE (addr_base) == SSA_NAME
4112       && !SSA_NAME_PTR_INFO (addr_base))
4113     {
4114       vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4115       if (offset || byte_offset)
4116         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4117     }
4118
4119   if (dump_enabled_p ())
4120     {
4121       dump_printf_loc (MSG_NOTE, vect_location, "created ");
4122       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4123       dump_printf (MSG_NOTE, "\n");
4124     }
4125
4126   return addr_base;
4127 }
4128
4129
4130 /* Function vect_create_data_ref_ptr.
4131
4132    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4133    location accessed in the loop by STMT, along with the def-use update
4134    chain to appropriately advance the pointer through the loop iterations.
4135    Also set aliasing information for the pointer.  This pointer is used by
4136    the callers to this function to create a memory reference expression for
4137    vector load/store access.
4138
4139    Input:
4140    1. STMT: a stmt that references memory. Expected to be of the form
4141          GIMPLE_ASSIGN <name, data-ref> or
4142          GIMPLE_ASSIGN <data-ref, name>.
4143    2. AGGR_TYPE: the type of the reference, which should be either a vector
4144         or an array.
4145    3. AT_LOOP: the loop where the vector memref is to be created.
4146    4. OFFSET (optional): an offset to be added to the initial address accessed
4147         by the data-ref in STMT.
4148    5. BSI: location where the new stmts are to be placed if there is no loop
4149    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4150         pointing to the initial address.
4151    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4152         to the initial address accessed by the data-ref in STMT.  This is
4153         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4154         in bytes.
4155
4156    Output:
4157    1. Declare a new ptr to vector_type, and have it point to the base of the
4158       data reference (initial addressed accessed by the data reference).
4159       For example, for vector of type V8HI, the following code is generated:
4160
4161       v8hi *ap;
4162       ap = (v8hi *)initial_address;
4163
4164       if OFFSET is not supplied:
4165          initial_address = &a[init];
4166       if OFFSET is supplied:
4167          initial_address = &a[init + OFFSET];
4168       if BYTE_OFFSET is supplied:
4169          initial_address = &a[init] + BYTE_OFFSET;
4170
4171       Return the initial_address in INITIAL_ADDRESS.
4172
4173    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4174       update the pointer in each iteration of the loop.
4175
4176       Return the increment stmt that updates the pointer in PTR_INCR.
4177
4178    3. Set INV_P to true if the access pattern of the data reference in the
4179       vectorized loop is invariant.  Set it to false otherwise.
4180
4181    4. Return the pointer.  */
4182
4183 tree
4184 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4185                           tree offset, tree *initial_address,
4186                           gimple_stmt_iterator *gsi, gimple **ptr_incr,
4187                           bool only_init, bool *inv_p, tree byte_offset)
4188 {
4189   const char *base_name;
4190   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4191   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4192   struct loop *loop = NULL;
4193   bool nested_in_vect_loop = false;
4194   struct loop *containing_loop = NULL;
4195   tree aggr_ptr_type;
4196   tree aggr_ptr;
4197   tree new_temp;
4198   gimple_seq new_stmt_list = NULL;
4199   edge pe = NULL;
4200   basic_block new_bb;
4201   tree aggr_ptr_init;
4202   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4203   tree aptr;
4204   gimple_stmt_iterator incr_gsi;
4205   bool insert_after;
4206   tree indx_before_incr, indx_after_incr;
4207   gimple *incr;
4208   tree step;
4209   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4210
4211   gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4212               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4213
4214   if (loop_vinfo)
4215     {
4216       loop = LOOP_VINFO_LOOP (loop_vinfo);
4217       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4218       containing_loop = (gimple_bb (stmt))->loop_father;
4219       pe = loop_preheader_edge (loop);
4220     }
4221   else
4222     {
4223       gcc_assert (bb_vinfo);
4224       only_init = true;
4225       *ptr_incr = NULL;
4226     }
4227
4228   /* Check the step (evolution) of the load in LOOP, and record
4229      whether it's invariant.  */
4230   step = vect_dr_behavior (dr)->step;
4231   if (integer_zerop (step))
4232     *inv_p = true;
4233   else
4234     *inv_p = false;
4235
4236   /* Create an expression for the first address accessed by this load
4237      in LOOP.  */
4238   base_name = get_name (DR_BASE_ADDRESS (dr));
4239
4240   if (dump_enabled_p ())
4241     {
4242       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4243       dump_printf_loc (MSG_NOTE, vect_location,
4244                        "create %s-pointer variable to type: ",
4245                        get_tree_code_name (TREE_CODE (aggr_type)));
4246       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4247       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4248         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4249       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4250         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4251       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4252         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4253       else
4254         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4255       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4256       dump_printf (MSG_NOTE, "\n");
4257     }
4258
4259   /* (1) Create the new aggregate-pointer variable.
4260      Vector and array types inherit the alias set of their component
4261      type by default so we need to use a ref-all pointer if the data
4262      reference does not conflict with the created aggregated data
4263      reference because it is not addressable.  */
4264   bool need_ref_all = false;
4265   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4266                               get_alias_set (DR_REF (dr))))
4267     need_ref_all = true;
4268   /* Likewise for any of the data references in the stmt group.  */
4269   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4270     {
4271       gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4272       do
4273         {
4274           stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4275           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4276           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4277                                       get_alias_set (DR_REF (sdr))))
4278             {
4279               need_ref_all = true;
4280               break;
4281             }
4282           orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4283         }
4284       while (orig_stmt);
4285     }
4286   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4287                                                need_ref_all);
4288   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4289
4290
4291   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4292      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4293      def-use update cycles for the pointer: one relative to the outer-loop
4294      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4295      to the inner-loop (which is the inner-most loop containing the dataref),
4296      and this is done be step (5) below.
4297
4298      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4299      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4300      redundant.  Steps (3),(4) create the following:
4301
4302         vp0 = &base_addr;
4303         LOOP:   vp1 = phi(vp0,vp2)
4304                 ...
4305                 ...
4306                 vp2 = vp1 + step
4307                 goto LOOP
4308
4309      If there is an inner-loop nested in loop, then step (5) will also be
4310      applied, and an additional update in the inner-loop will be created:
4311
4312         vp0 = &base_addr;
4313         LOOP:   vp1 = phi(vp0,vp2)
4314                 ...
4315         inner:     vp3 = phi(vp1,vp4)
4316                    vp4 = vp3 + inner_step
4317                    if () goto inner
4318                 ...
4319                 vp2 = vp1 + step
4320                 if () goto LOOP   */
4321
4322   /* (2) Calculate the initial address of the aggregate-pointer, and set
4323      the aggregate-pointer to point to it before the loop.  */
4324
4325   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4326
4327   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4328                                                    offset, byte_offset);
4329   if (new_stmt_list)
4330     {
4331       if (pe)
4332         {
4333           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4334           gcc_assert (!new_bb);
4335         }
4336       else
4337         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4338     }
4339
4340   *initial_address = new_temp;
4341   aggr_ptr_init = new_temp;
4342
4343   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4344      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4345      inner-loop nested in LOOP (during outer-loop vectorization).  */
4346
4347   /* No update in loop is required.  */
4348   if (only_init && (!loop_vinfo || at_loop == loop))
4349     aptr = aggr_ptr_init;
4350   else
4351     {
4352       /* The step of the aggregate pointer is the type size.  */
4353       tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4354       /* One exception to the above is when the scalar step of the load in
4355          LOOP is zero. In this case the step here is also zero.  */
4356       if (*inv_p)
4357         iv_step = size_zero_node;
4358       else if (tree_int_cst_sgn (step) == -1)
4359         iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4360
4361       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4362
4363       create_iv (aggr_ptr_init,
4364                  fold_convert (aggr_ptr_type, iv_step),
4365                  aggr_ptr, loop, &incr_gsi, insert_after,
4366                  &indx_before_incr, &indx_after_incr);
4367       incr = gsi_stmt (incr_gsi);
4368       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4369
4370       /* Copy the points-to information if it exists. */
4371       if (DR_PTR_INFO (dr))
4372         {
4373           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4374           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4375         }
4376       if (ptr_incr)
4377         *ptr_incr = incr;
4378
4379       aptr = indx_before_incr;
4380     }
4381
4382   if (!nested_in_vect_loop || only_init)
4383     return aptr;
4384
4385
4386   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4387      nested in LOOP, if exists.  */
4388
4389   gcc_assert (nested_in_vect_loop);
4390   if (!only_init)
4391     {
4392       standard_iv_increment_position (containing_loop, &incr_gsi,
4393                                       &insert_after);
4394       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4395                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4396                  &indx_after_incr);
4397       incr = gsi_stmt (incr_gsi);
4398       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4399
4400       /* Copy the points-to information if it exists. */
4401       if (DR_PTR_INFO (dr))
4402         {
4403           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4404           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4405         }
4406       if (ptr_incr)
4407         *ptr_incr = incr;
4408
4409       return indx_before_incr;
4410     }
4411   else
4412     gcc_unreachable ();
4413 }
4414
4415
4416 /* Function bump_vector_ptr
4417
4418    Increment a pointer (to a vector type) by vector-size. If requested,
4419    i.e. if PTR-INCR is given, then also connect the new increment stmt
4420    to the existing def-use update-chain of the pointer, by modifying
4421    the PTR_INCR as illustrated below:
4422
4423    The pointer def-use update-chain before this function:
4424                         DATAREF_PTR = phi (p_0, p_2)
4425                         ....
4426         PTR_INCR:       p_2 = DATAREF_PTR + step
4427
4428    The pointer def-use update-chain after this function:
4429                         DATAREF_PTR = phi (p_0, p_2)
4430                         ....
4431                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4432                         ....
4433         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4434
4435    Input:
4436    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4437                  in the loop.
4438    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4439               the loop.  The increment amount across iterations is expected
4440               to be vector_size.
4441    BSI - location where the new update stmt is to be placed.
4442    STMT - the original scalar memory-access stmt that is being vectorized.
4443    BUMP - optional. The offset by which to bump the pointer. If not given,
4444           the offset is assumed to be vector_size.
4445
4446    Output: Return NEW_DATAREF_PTR as illustrated above.
4447
4448 */
4449
4450 tree
4451 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4452                  gimple *stmt, tree bump)
4453 {
4454   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4455   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4456   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4457   tree update = TYPE_SIZE_UNIT (vectype);
4458   gassign *incr_stmt;
4459   ssa_op_iter iter;
4460   use_operand_p use_p;
4461   tree new_dataref_ptr;
4462
4463   if (bump)
4464     update = bump;
4465
4466   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4467     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4468   else
4469     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4470   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4471                                    dataref_ptr, update);
4472   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4473
4474   /* Copy the points-to information if it exists. */
4475   if (DR_PTR_INFO (dr))
4476     {
4477       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4478       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4479     }
4480
4481   if (!ptr_incr)
4482     return new_dataref_ptr;
4483
4484   /* Update the vector-pointer's cross-iteration increment.  */
4485   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4486     {
4487       tree use = USE_FROM_PTR (use_p);
4488
4489       if (use == dataref_ptr)
4490         SET_USE (use_p, new_dataref_ptr);
4491       else
4492         gcc_assert (tree_int_cst_compare (use, update) == 0);
4493     }
4494
4495   return new_dataref_ptr;
4496 }
4497
4498
4499 /* Function vect_create_destination_var.
4500
4501    Create a new temporary of type VECTYPE.  */
4502
4503 tree
4504 vect_create_destination_var (tree scalar_dest, tree vectype)
4505 {
4506   tree vec_dest;
4507   const char *name;
4508   char *new_name;
4509   tree type;
4510   enum vect_var_kind kind;
4511
4512   kind = vectype
4513     ? VECTOR_BOOLEAN_TYPE_P (vectype)
4514     ? vect_mask_var
4515     : vect_simple_var
4516     : vect_scalar_var;
4517   type = vectype ? vectype : TREE_TYPE (scalar_dest);
4518
4519   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4520
4521   name = get_name (scalar_dest);
4522   if (name)
4523     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4524   else
4525     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4526   vec_dest = vect_get_new_vect_var (type, kind, new_name);
4527   free (new_name);
4528
4529   return vec_dest;
4530 }
4531
4532 /* Function vect_grouped_store_supported.
4533
4534    Returns TRUE if interleave high and interleave low permutations
4535    are supported, and FALSE otherwise.  */
4536
4537 bool
4538 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4539 {
4540   machine_mode mode = TYPE_MODE (vectype);
4541
4542   /* vect_permute_store_chain requires the group size to be equal to 3 or
4543      be a power of two.  */
4544   if (count != 3 && exact_log2 (count) == -1)
4545     {
4546       if (dump_enabled_p ())
4547         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4548                          "the size of the group of accesses"
4549                          " is not a power of 2 or not eqaul to 3\n");
4550       return false;
4551     }
4552
4553   /* Check that the permutation is supported.  */
4554   if (VECTOR_MODE_P (mode))
4555     {
4556       unsigned int i, nelt = GET_MODE_NUNITS (mode);
4557       auto_vec_perm_indices sel (nelt);
4558       sel.quick_grow (nelt);
4559
4560       if (count == 3)
4561         {
4562           unsigned int j0 = 0, j1 = 0, j2 = 0;
4563           unsigned int i, j;
4564
4565           for (j = 0; j < 3; j++)
4566             {
4567               int nelt0 = ((3 - j) * nelt) % 3;
4568               int nelt1 = ((3 - j) * nelt + 1) % 3;
4569               int nelt2 = ((3 - j) * nelt + 2) % 3;
4570               for (i = 0; i < nelt; i++)
4571                 {
4572                   if (3 * i + nelt0 < nelt)
4573                     sel[3 * i + nelt0] = j0++;
4574                   if (3 * i + nelt1 < nelt)
4575                     sel[3 * i + nelt1] = nelt + j1++;
4576                   if (3 * i + nelt2 < nelt)
4577                     sel[3 * i + nelt2] = 0;
4578                 }
4579               if (!can_vec_perm_p (mode, false, &sel))
4580                 {
4581                   if (dump_enabled_p ())
4582                     dump_printf (MSG_MISSED_OPTIMIZATION,
4583                                  "permutaion op not supported by target.\n");
4584                   return false;
4585                 }
4586
4587               for (i = 0; i < nelt; i++)
4588                 {
4589                   if (3 * i + nelt0 < nelt)
4590                     sel[3 * i + nelt0] = 3 * i + nelt0;
4591                   if (3 * i + nelt1 < nelt)
4592                     sel[3 * i + nelt1] = 3 * i + nelt1;
4593                   if (3 * i + nelt2 < nelt)
4594                     sel[3 * i + nelt2] = nelt + j2++;
4595                 }
4596               if (!can_vec_perm_p (mode, false, &sel))
4597                 {
4598                   if (dump_enabled_p ())
4599                     dump_printf (MSG_MISSED_OPTIMIZATION,
4600                                  "permutaion op not supported by target.\n");
4601                   return false;
4602                 }
4603             }
4604           return true;
4605         }
4606       else
4607         {
4608           /* If length is not equal to 3 then only power of 2 is supported.  */
4609           gcc_assert (pow2p_hwi (count));
4610
4611           for (i = 0; i < nelt / 2; i++)
4612             {
4613               sel[i * 2] = i;
4614               sel[i * 2 + 1] = i + nelt;
4615             }
4616           if (can_vec_perm_p (mode, false, &sel))
4617             {
4618               for (i = 0; i < nelt; i++)
4619                 sel[i] += nelt / 2;
4620               if (can_vec_perm_p (mode, false, &sel))
4621                 return true;
4622             }
4623         }
4624     }
4625
4626   if (dump_enabled_p ())
4627     dump_printf (MSG_MISSED_OPTIMIZATION,
4628                  "permutaion op not supported by target.\n");
4629   return false;
4630 }
4631
4632
4633 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4634    type VECTYPE.  */
4635
4636 bool
4637 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4638 {
4639   return vect_lanes_optab_supported_p ("vec_store_lanes",
4640                                        vec_store_lanes_optab,
4641                                        vectype, count);
4642 }
4643
4644
4645 /* Function vect_permute_store_chain.
4646
4647    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4648    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4649    the data correctly for the stores.  Return the final references for stores
4650    in RESULT_CHAIN.
4651
4652    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4653    The input is 4 vectors each containing 8 elements.  We assign a number to
4654    each element, the input sequence is:
4655
4656    1st vec:   0  1  2  3  4  5  6  7
4657    2nd vec:   8  9 10 11 12 13 14 15
4658    3rd vec:  16 17 18 19 20 21 22 23
4659    4th vec:  24 25 26 27 28 29 30 31
4660
4661    The output sequence should be:
4662
4663    1st vec:  0  8 16 24  1  9 17 25
4664    2nd vec:  2 10 18 26  3 11 19 27
4665    3rd vec:  4 12 20 28  5 13 21 30
4666    4th vec:  6 14 22 30  7 15 23 31
4667
4668    i.e., we interleave the contents of the four vectors in their order.
4669
4670    We use interleave_high/low instructions to create such output.  The input of
4671    each interleave_high/low operation is two vectors:
4672    1st vec    2nd vec
4673    0 1 2 3    4 5 6 7
4674    the even elements of the result vector are obtained left-to-right from the
4675    high/low elements of the first vector.  The odd elements of the result are
4676    obtained left-to-right from the high/low elements of the second vector.
4677    The output of interleave_high will be:   0 4 1 5
4678    and of interleave_low:                   2 6 3 7
4679
4680
4681    The permutation is done in log LENGTH stages.  In each stage interleave_high
4682    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4683    where the first argument is taken from the first half of DR_CHAIN and the
4684    second argument from it's second half.
4685    In our example,
4686
4687    I1: interleave_high (1st vec, 3rd vec)
4688    I2: interleave_low (1st vec, 3rd vec)
4689    I3: interleave_high (2nd vec, 4th vec)
4690    I4: interleave_low (2nd vec, 4th vec)
4691
4692    The output for the first stage is:
4693
4694    I1:  0 16  1 17  2 18  3 19
4695    I2:  4 20  5 21  6 22  7 23
4696    I3:  8 24  9 25 10 26 11 27
4697    I4: 12 28 13 29 14 30 15 31
4698
4699    The output of the second stage, i.e. the final result is:
4700
4701    I1:  0  8 16 24  1  9 17 25
4702    I2:  2 10 18 26  3 11 19 27
4703    I3:  4 12 20 28  5 13 21 30
4704    I4:  6 14 22 30  7 15 23 31.  */
4705
4706 void
4707 vect_permute_store_chain (vec<tree> dr_chain,
4708                           unsigned int length,
4709                           gimple *stmt,
4710                           gimple_stmt_iterator *gsi,
4711                           vec<tree> *result_chain)
4712 {
4713   tree vect1, vect2, high, low;
4714   gimple *perm_stmt;
4715   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4716   tree perm_mask_low, perm_mask_high;
4717   tree data_ref;
4718   tree perm3_mask_low, perm3_mask_high;
4719   unsigned int i, n, log_length = exact_log2 (length);
4720   unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4721
4722   auto_vec_perm_indices sel (nelt);
4723   sel.quick_grow (nelt);
4724
4725   result_chain->quick_grow (length);
4726   memcpy (result_chain->address (), dr_chain.address (),
4727           length * sizeof (tree));
4728
4729   if (length == 3)
4730     {
4731       unsigned int j0 = 0, j1 = 0, j2 = 0;
4732
4733       for (j = 0; j < 3; j++)
4734         {
4735           int nelt0 = ((3 - j) * nelt) % 3;
4736           int nelt1 = ((3 - j) * nelt + 1) % 3;
4737           int nelt2 = ((3 - j) * nelt + 2) % 3;
4738
4739           for (i = 0; i < nelt; i++)
4740             {
4741               if (3 * i + nelt0 < nelt)
4742                 sel[3 * i + nelt0] = j0++;
4743               if (3 * i + nelt1 < nelt)
4744                 sel[3 * i + nelt1] = nelt + j1++;
4745               if (3 * i + nelt2 < nelt)
4746                 sel[3 * i + nelt2] = 0;
4747             }
4748           perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4749
4750           for (i = 0; i < nelt; i++)
4751             {
4752               if (3 * i + nelt0 < nelt)
4753                 sel[3 * i + nelt0] = 3 * i + nelt0;
4754               if (3 * i + nelt1 < nelt)
4755                 sel[3 * i + nelt1] = 3 * i + nelt1;
4756               if (3 * i + nelt2 < nelt)
4757                 sel[3 * i + nelt2] = nelt + j2++;
4758             }
4759           perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4760
4761           vect1 = dr_chain[0];
4762           vect2 = dr_chain[1];
4763
4764           /* Create interleaving stmt:
4765              low = VEC_PERM_EXPR <vect1, vect2,
4766                                   {j, nelt, *, j + 1, nelt + j + 1, *,
4767                                    j + 2, nelt + j + 2, *, ...}>  */
4768           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4769           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4770                                            vect2, perm3_mask_low);
4771           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4772
4773           vect1 = data_ref;
4774           vect2 = dr_chain[2];
4775           /* Create interleaving stmt:
4776              low = VEC_PERM_EXPR <vect1, vect2,
4777                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
4778                                    6, 7, nelt + j + 2, ...}>  */
4779           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4780           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4781                                            vect2, perm3_mask_high);
4782           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4783           (*result_chain)[j] = data_ref;
4784         }
4785     }
4786   else
4787     {
4788       /* If length is not equal to 3 then only power of 2 is supported.  */
4789       gcc_assert (pow2p_hwi (length));
4790
4791       for (i = 0, n = nelt / 2; i < n; i++)
4792         {
4793           sel[i * 2] = i;
4794           sel[i * 2 + 1] = i + nelt;
4795         }
4796         perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4797
4798         for (i = 0; i < nelt; i++)
4799           sel[i] += nelt / 2;
4800         perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4801
4802         for (i = 0, n = log_length; i < n; i++)
4803           {
4804             for (j = 0; j < length/2; j++)
4805               {
4806                 vect1 = dr_chain[j];
4807                 vect2 = dr_chain[j+length/2];
4808
4809                 /* Create interleaving stmt:
4810                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4811                                                         ...}>  */
4812                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4813                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4814                                                  vect2, perm_mask_high);
4815                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4816                 (*result_chain)[2*j] = high;
4817
4818                 /* Create interleaving stmt:
4819                    low = VEC_PERM_EXPR <vect1, vect2,
4820                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4821                                          ...}>  */
4822                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4823                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4824                                                  vect2, perm_mask_low);
4825                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4826                 (*result_chain)[2*j+1] = low;
4827               }
4828             memcpy (dr_chain.address (), result_chain->address (),
4829                     length * sizeof (tree));
4830           }
4831     }
4832 }
4833
4834 /* Function vect_setup_realignment
4835
4836    This function is called when vectorizing an unaligned load using
4837    the dr_explicit_realign[_optimized] scheme.
4838    This function generates the following code at the loop prolog:
4839
4840       p = initial_addr;
4841    x  msq_init = *(floor(p));   # prolog load
4842       realignment_token = call target_builtin;
4843     loop:
4844    x  msq = phi (msq_init, ---)
4845
4846    The stmts marked with x are generated only for the case of
4847    dr_explicit_realign_optimized.
4848
4849    The code above sets up a new (vector) pointer, pointing to the first
4850    location accessed by STMT, and a "floor-aligned" load using that pointer.
4851    It also generates code to compute the "realignment-token" (if the relevant
4852    target hook was defined), and creates a phi-node at the loop-header bb
4853    whose arguments are the result of the prolog-load (created by this
4854    function) and the result of a load that takes place in the loop (to be
4855    created by the caller to this function).
4856
4857    For the case of dr_explicit_realign_optimized:
4858    The caller to this function uses the phi-result (msq) to create the
4859    realignment code inside the loop, and sets up the missing phi argument,
4860    as follows:
4861     loop:
4862       msq = phi (msq_init, lsq)
4863       lsq = *(floor(p'));        # load in loop
4864       result = realign_load (msq, lsq, realignment_token);
4865
4866    For the case of dr_explicit_realign:
4867     loop:
4868       msq = *(floor(p));        # load in loop
4869       p' = p + (VS-1);
4870       lsq = *(floor(p'));       # load in loop
4871       result = realign_load (msq, lsq, realignment_token);
4872
4873    Input:
4874    STMT - (scalar) load stmt to be vectorized. This load accesses
4875           a memory location that may be unaligned.
4876    BSI - place where new code is to be inserted.
4877    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4878                               is used.
4879
4880    Output:
4881    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4882                        target hook, if defined.
4883    Return value - the result of the loop-header phi node.  */
4884
4885 tree
4886 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4887                         tree *realignment_token,
4888                         enum dr_alignment_support alignment_support_scheme,
4889                         tree init_addr,
4890                         struct loop **at_loop)
4891 {
4892   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4893   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4894   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4895   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4896   struct loop *loop = NULL;
4897   edge pe = NULL;
4898   tree scalar_dest = gimple_assign_lhs (stmt);
4899   tree vec_dest;
4900   gimple *inc;
4901   tree ptr;
4902   tree data_ref;
4903   basic_block new_bb;
4904   tree msq_init = NULL_TREE;
4905   tree new_temp;
4906   gphi *phi_stmt;
4907   tree msq = NULL_TREE;
4908   gimple_seq stmts = NULL;
4909   bool inv_p;
4910   bool compute_in_loop = false;
4911   bool nested_in_vect_loop = false;
4912   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4913   struct loop *loop_for_initial_load = NULL;
4914
4915   if (loop_vinfo)
4916     {
4917       loop = LOOP_VINFO_LOOP (loop_vinfo);
4918       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4919     }
4920
4921   gcc_assert (alignment_support_scheme == dr_explicit_realign
4922               || alignment_support_scheme == dr_explicit_realign_optimized);
4923
4924   /* We need to generate three things:
4925      1. the misalignment computation
4926      2. the extra vector load (for the optimized realignment scheme).
4927      3. the phi node for the two vectors from which the realignment is
4928       done (for the optimized realignment scheme).  */
4929
4930   /* 1. Determine where to generate the misalignment computation.
4931
4932      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4933      calculation will be generated by this function, outside the loop (in the
4934      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
4935      caller, inside the loop.
4936
4937      Background: If the misalignment remains fixed throughout the iterations of
4938      the loop, then both realignment schemes are applicable, and also the
4939      misalignment computation can be done outside LOOP.  This is because we are
4940      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4941      are a multiple of VS (the Vector Size), and therefore the misalignment in
4942      different vectorized LOOP iterations is always the same.
4943      The problem arises only if the memory access is in an inner-loop nested
4944      inside LOOP, which is now being vectorized using outer-loop vectorization.
4945      This is the only case when the misalignment of the memory access may not
4946      remain fixed throughout the iterations of the inner-loop (as explained in
4947      detail in vect_supportable_dr_alignment).  In this case, not only is the
4948      optimized realignment scheme not applicable, but also the misalignment
4949      computation (and generation of the realignment token that is passed to
4950      REALIGN_LOAD) have to be done inside the loop.
4951
4952      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4953      or not, which in turn determines if the misalignment is computed inside
4954      the inner-loop, or outside LOOP.  */
4955
4956   if (init_addr != NULL_TREE || !loop_vinfo)
4957     {
4958       compute_in_loop = true;
4959       gcc_assert (alignment_support_scheme == dr_explicit_realign);
4960     }
4961
4962
4963   /* 2. Determine where to generate the extra vector load.
4964
4965      For the optimized realignment scheme, instead of generating two vector
4966      loads in each iteration, we generate a single extra vector load in the
4967      preheader of the loop, and in each iteration reuse the result of the
4968      vector load from the previous iteration.  In case the memory access is in
4969      an inner-loop nested inside LOOP, which is now being vectorized using
4970      outer-loop vectorization, we need to determine whether this initial vector
4971      load should be generated at the preheader of the inner-loop, or can be
4972      generated at the preheader of LOOP.  If the memory access has no evolution
4973      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4974      to be generated inside LOOP (in the preheader of the inner-loop).  */
4975
4976   if (nested_in_vect_loop)
4977     {
4978       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4979       bool invariant_in_outerloop =
4980             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4981       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4982     }
4983   else
4984     loop_for_initial_load = loop;
4985   if (at_loop)
4986     *at_loop = loop_for_initial_load;
4987
4988   if (loop_for_initial_load)
4989     pe = loop_preheader_edge (loop_for_initial_load);
4990
4991   /* 3. For the case of the optimized realignment, create the first vector
4992       load at the loop preheader.  */
4993
4994   if (alignment_support_scheme == dr_explicit_realign_optimized)
4995     {
4996       /* Create msq_init = *(floor(p1)) in the loop preheader  */
4997       gassign *new_stmt;
4998
4999       gcc_assert (!compute_in_loop);
5000       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5001       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5002                                       NULL_TREE, &init_addr, NULL, &inc,
5003                                       true, &inv_p);
5004       if (TREE_CODE (ptr) == SSA_NAME)
5005         new_temp = copy_ssa_name (ptr);
5006       else
5007         new_temp = make_ssa_name (TREE_TYPE (ptr));
5008       unsigned int align = DR_TARGET_ALIGNMENT (dr);
5009       new_stmt = gimple_build_assign
5010                    (new_temp, BIT_AND_EXPR, ptr,
5011                     build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5012       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5013       gcc_assert (!new_bb);
5014       data_ref
5015         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5016                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5017       new_stmt = gimple_build_assign (vec_dest, data_ref);
5018       new_temp = make_ssa_name (vec_dest, new_stmt);
5019       gimple_assign_set_lhs (new_stmt, new_temp);
5020       if (pe)
5021         {
5022           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5023           gcc_assert (!new_bb);
5024         }
5025       else
5026          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5027
5028       msq_init = gimple_assign_lhs (new_stmt);
5029     }
5030
5031   /* 4. Create realignment token using a target builtin, if available.
5032       It is done either inside the containing loop, or before LOOP (as
5033       determined above).  */
5034
5035   if (targetm.vectorize.builtin_mask_for_load)
5036     {
5037       gcall *new_stmt;
5038       tree builtin_decl;
5039
5040       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
5041       if (!init_addr)
5042         {
5043           /* Generate the INIT_ADDR computation outside LOOP.  */
5044           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5045                                                             NULL_TREE);
5046           if (loop)
5047             {
5048               pe = loop_preheader_edge (loop);
5049               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5050               gcc_assert (!new_bb);
5051             }
5052           else
5053              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5054         }
5055
5056       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5057       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5058       vec_dest =
5059         vect_create_destination_var (scalar_dest,
5060                                      gimple_call_return_type (new_stmt));
5061       new_temp = make_ssa_name (vec_dest, new_stmt);
5062       gimple_call_set_lhs (new_stmt, new_temp);
5063
5064       if (compute_in_loop)
5065         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5066       else
5067         {
5068           /* Generate the misalignment computation outside LOOP.  */
5069           pe = loop_preheader_edge (loop);
5070           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5071           gcc_assert (!new_bb);
5072         }
5073
5074       *realignment_token = gimple_call_lhs (new_stmt);
5075
5076       /* The result of the CALL_EXPR to this builtin is determined from
5077          the value of the parameter and no global variables are touched
5078          which makes the builtin a "const" function.  Requiring the
5079          builtin to have the "const" attribute makes it unnecessary
5080          to call mark_call_clobbered.  */
5081       gcc_assert (TREE_READONLY (builtin_decl));
5082     }
5083
5084   if (alignment_support_scheme == dr_explicit_realign)
5085     return msq;
5086
5087   gcc_assert (!compute_in_loop);
5088   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5089
5090
5091   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5092
5093   pe = loop_preheader_edge (containing_loop);
5094   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5095   msq = make_ssa_name (vec_dest);
5096   phi_stmt = create_phi_node (msq, containing_loop->header);
5097   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5098
5099   return msq;
5100 }
5101
5102
5103 /* Function vect_grouped_load_supported.
5104
5105    COUNT is the size of the load group (the number of statements plus the
5106    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5107    only one statement, with a gap of COUNT - 1.
5108
5109    Returns true if a suitable permute exists.  */
5110
5111 bool
5112 vect_grouped_load_supported (tree vectype, bool single_element_p,
5113                              unsigned HOST_WIDE_INT count)
5114 {
5115   machine_mode mode = TYPE_MODE (vectype);
5116
5117   /* If this is single-element interleaving with an element distance
5118      that leaves unused vector loads around punt - we at least create
5119      very sub-optimal code in that case (and blow up memory,
5120      see PR65518).  */
5121   if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5122     {
5123       if (dump_enabled_p ())
5124         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5125                          "single-element interleaving not supported "
5126                          "for not adjacent vector loads\n");
5127       return false;
5128     }
5129
5130   /* vect_permute_load_chain requires the group size to be equal to 3 or
5131      be a power of two.  */
5132   if (count != 3 && exact_log2 (count) == -1)
5133     {
5134       if (dump_enabled_p ())
5135         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5136                          "the size of the group of accesses"
5137                          " is not a power of 2 or not equal to 3\n");
5138       return false;
5139     }
5140
5141   /* Check that the permutation is supported.  */
5142   if (VECTOR_MODE_P (mode))
5143     {
5144       unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5145       auto_vec_perm_indices sel (nelt);
5146       sel.quick_grow (nelt);
5147
5148       if (count == 3)
5149         {
5150           unsigned int k;
5151           for (k = 0; k < 3; k++)
5152             {
5153               for (i = 0; i < nelt; i++)
5154                 if (3 * i + k < 2 * nelt)
5155                   sel[i] = 3 * i + k;
5156                 else
5157                   sel[i] = 0;
5158               if (!can_vec_perm_p (mode, false, &sel))
5159                 {
5160                   if (dump_enabled_p ())
5161                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5162                                      "shuffle of 3 loads is not supported by"
5163                                      " target\n");
5164                   return false;
5165                 }
5166               for (i = 0, j = 0; i < nelt; i++)
5167                 if (3 * i + k < 2 * nelt)
5168                   sel[i] = i;
5169                 else
5170                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5171               if (!can_vec_perm_p (mode, false, &sel))
5172                 {
5173                   if (dump_enabled_p ())
5174                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5175                                      "shuffle of 3 loads is not supported by"
5176                                      " target\n");
5177                   return false;
5178                 }
5179             }
5180           return true;
5181         }
5182       else
5183         {
5184           /* If length is not equal to 3 then only power of 2 is supported.  */
5185           gcc_assert (pow2p_hwi (count));
5186           for (i = 0; i < nelt; i++)
5187             sel[i] = i * 2;
5188           if (can_vec_perm_p (mode, false, &sel))
5189             {
5190               for (i = 0; i < nelt; i++)
5191                 sel[i] = i * 2 + 1;
5192               if (can_vec_perm_p (mode, false, &sel))
5193                 return true;
5194             }
5195         }
5196     }
5197
5198   if (dump_enabled_p ())
5199     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5200                      "extract even/odd not supported by target\n");
5201   return false;
5202 }
5203
5204 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5205    type VECTYPE.  */
5206
5207 bool
5208 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5209 {
5210   return vect_lanes_optab_supported_p ("vec_load_lanes",
5211                                        vec_load_lanes_optab,
5212                                        vectype, count);
5213 }
5214
5215 /* Function vect_permute_load_chain.
5216
5217    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5218    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5219    the input data correctly.  Return the final references for loads in
5220    RESULT_CHAIN.
5221
5222    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5223    The input is 4 vectors each containing 8 elements. We assign a number to each
5224    element, the input sequence is:
5225
5226    1st vec:   0  1  2  3  4  5  6  7
5227    2nd vec:   8  9 10 11 12 13 14 15
5228    3rd vec:  16 17 18 19 20 21 22 23
5229    4th vec:  24 25 26 27 28 29 30 31
5230
5231    The output sequence should be:
5232
5233    1st vec:  0 4  8 12 16 20 24 28
5234    2nd vec:  1 5  9 13 17 21 25 29
5235    3rd vec:  2 6 10 14 18 22 26 30
5236    4th vec:  3 7 11 15 19 23 27 31
5237
5238    i.e., the first output vector should contain the first elements of each
5239    interleaving group, etc.
5240
5241    We use extract_even/odd instructions to create such output.  The input of
5242    each extract_even/odd operation is two vectors
5243    1st vec    2nd vec
5244    0 1 2 3    4 5 6 7
5245
5246    and the output is the vector of extracted even/odd elements.  The output of
5247    extract_even will be:   0 2 4 6
5248    and of extract_odd:     1 3 5 7
5249
5250
5251    The permutation is done in log LENGTH stages.  In each stage extract_even
5252    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5253    their order.  In our example,
5254
5255    E1: extract_even (1st vec, 2nd vec)
5256    E2: extract_odd (1st vec, 2nd vec)
5257    E3: extract_even (3rd vec, 4th vec)
5258    E4: extract_odd (3rd vec, 4th vec)
5259
5260    The output for the first stage will be:
5261
5262    E1:  0  2  4  6  8 10 12 14
5263    E2:  1  3  5  7  9 11 13 15
5264    E3: 16 18 20 22 24 26 28 30
5265    E4: 17 19 21 23 25 27 29 31
5266
5267    In order to proceed and create the correct sequence for the next stage (or
5268    for the correct output, if the second stage is the last one, as in our
5269    example), we first put the output of extract_even operation and then the
5270    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5271    The input for the second stage is:
5272
5273    1st vec (E1):  0  2  4  6  8 10 12 14
5274    2nd vec (E3): 16 18 20 22 24 26 28 30
5275    3rd vec (E2):  1  3  5  7  9 11 13 15
5276    4th vec (E4): 17 19 21 23 25 27 29 31
5277
5278    The output of the second stage:
5279
5280    E1: 0 4  8 12 16 20 24 28
5281    E2: 2 6 10 14 18 22 26 30
5282    E3: 1 5  9 13 17 21 25 29
5283    E4: 3 7 11 15 19 23 27 31
5284
5285    And RESULT_CHAIN after reordering:
5286
5287    1st vec (E1):  0 4  8 12 16 20 24 28
5288    2nd vec (E3):  1 5  9 13 17 21 25 29
5289    3rd vec (E2):  2 6 10 14 18 22 26 30
5290    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5291
5292 static void
5293 vect_permute_load_chain (vec<tree> dr_chain,
5294                          unsigned int length,
5295                          gimple *stmt,
5296                          gimple_stmt_iterator *gsi,
5297                          vec<tree> *result_chain)
5298 {
5299   tree data_ref, first_vect, second_vect;
5300   tree perm_mask_even, perm_mask_odd;
5301   tree perm3_mask_low, perm3_mask_high;
5302   gimple *perm_stmt;
5303   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5304   unsigned int i, j, log_length = exact_log2 (length);
5305   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5306
5307   auto_vec_perm_indices sel (nelt);
5308   sel.quick_grow (nelt);
5309
5310   result_chain->quick_grow (length);
5311   memcpy (result_chain->address (), dr_chain.address (),
5312           length * sizeof (tree));
5313
5314   if (length == 3)
5315     {
5316       unsigned int k;
5317
5318       for (k = 0; k < 3; k++)
5319         {
5320           for (i = 0; i < nelt; i++)
5321             if (3 * i + k < 2 * nelt)
5322               sel[i] = 3 * i + k;
5323             else
5324               sel[i] = 0;
5325           perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5326
5327           for (i = 0, j = 0; i < nelt; i++)
5328             if (3 * i + k < 2 * nelt)
5329               sel[i] = i;
5330             else
5331               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5332
5333           perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5334
5335           first_vect = dr_chain[0];
5336           second_vect = dr_chain[1];
5337
5338           /* Create interleaving stmt (low part of):
5339              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5340                                                              ...}>  */
5341           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5342           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5343                                            second_vect, perm3_mask_low);
5344           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5345
5346           /* Create interleaving stmt (high part of):
5347              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5348                                                               ...}>  */
5349           first_vect = data_ref;
5350           second_vect = dr_chain[2];
5351           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5352           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5353                                            second_vect, perm3_mask_high);
5354           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5355           (*result_chain)[k] = data_ref;
5356         }
5357     }
5358   else
5359     {
5360       /* If length is not equal to 3 then only power of 2 is supported.  */
5361       gcc_assert (pow2p_hwi (length));
5362
5363       for (i = 0; i < nelt; ++i)
5364         sel[i] = i * 2;
5365       perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5366
5367       for (i = 0; i < nelt; ++i)
5368         sel[i] = i * 2 + 1;
5369       perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5370
5371       for (i = 0; i < log_length; i++)
5372         {
5373           for (j = 0; j < length; j += 2)
5374             {
5375               first_vect = dr_chain[j];
5376               second_vect = dr_chain[j+1];
5377
5378               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5379               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5380               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5381                                                first_vect, second_vect,
5382                                                perm_mask_even);
5383               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5384               (*result_chain)[j/2] = data_ref;
5385
5386               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5387               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5388               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5389                                                first_vect, second_vect,
5390                                                perm_mask_odd);
5391               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5392               (*result_chain)[j/2+length/2] = data_ref;
5393             }
5394           memcpy (dr_chain.address (), result_chain->address (),
5395                   length * sizeof (tree));
5396         }
5397     }
5398 }
5399
5400 /* Function vect_shift_permute_load_chain.
5401
5402    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5403    sequence of stmts to reorder the input data accordingly.
5404    Return the final references for loads in RESULT_CHAIN.
5405    Return true if successed, false otherwise.
5406
5407    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5408    The input is 3 vectors each containing 8 elements.  We assign a
5409    number to each element, the input sequence is:
5410
5411    1st vec:   0  1  2  3  4  5  6  7
5412    2nd vec:   8  9 10 11 12 13 14 15
5413    3rd vec:  16 17 18 19 20 21 22 23
5414
5415    The output sequence should be:
5416
5417    1st vec:  0 3 6  9 12 15 18 21
5418    2nd vec:  1 4 7 10 13 16 19 22
5419    3rd vec:  2 5 8 11 14 17 20 23
5420
5421    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5422
5423    First we shuffle all 3 vectors to get correct elements order:
5424
5425    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5426    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5427    3rd vec:  (16 19 22) (17 20 23) (18 21)
5428
5429    Next we unite and shift vector 3 times:
5430
5431    1st step:
5432      shift right by 6 the concatenation of:
5433      "1st vec" and  "2nd vec"
5434        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5435      "2nd vec" and  "3rd vec"
5436        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5437      "3rd vec" and  "1st vec"
5438        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5439                              | New vectors                   |
5440
5441      So that now new vectors are:
5442
5443      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5444      2nd vec:  (10 13) (16 19 22) (17 20 23)
5445      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5446
5447    2nd step:
5448      shift right by 5 the concatenation of:
5449      "1st vec" and  "3rd vec"
5450        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5451      "2nd vec" and  "1st vec"
5452        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5453      "3rd vec" and  "2nd vec"
5454        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5455                           | New vectors                   |
5456
5457      So that now new vectors are:
5458
5459      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5460      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
5461      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
5462
5463    3rd step:
5464      shift right by 5 the concatenation of:
5465      "1st vec" and  "1st vec"
5466        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
5467      shift right by 3 the concatenation of:
5468      "2nd vec" and  "2nd vec"
5469                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
5470                           | New vectors                   |
5471
5472      So that now all vectors are READY:
5473      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
5474      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
5475      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
5476
5477    This algorithm is faster than one in vect_permute_load_chain if:
5478      1.  "shift of a concatination" is faster than general permutation.
5479          This is usually so.
5480      2.  The TARGET machine can't execute vector instructions in parallel.
5481          This is because each step of the algorithm depends on previous.
5482          The algorithm in vect_permute_load_chain is much more parallel.
5483
5484    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5485 */
5486
5487 static bool
5488 vect_shift_permute_load_chain (vec<tree> dr_chain,
5489                                unsigned int length,
5490                                gimple *stmt,
5491                                gimple_stmt_iterator *gsi,
5492                                vec<tree> *result_chain)
5493 {
5494   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5495   tree perm2_mask1, perm2_mask2, perm3_mask;
5496   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5497   gimple *perm_stmt;
5498
5499   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5500   unsigned int i;
5501   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5502   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5503   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5504
5505   auto_vec_perm_indices sel (nelt);
5506   sel.quick_grow (nelt);
5507
5508   result_chain->quick_grow (length);
5509   memcpy (result_chain->address (), dr_chain.address (),
5510           length * sizeof (tree));
5511
5512   if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5513     {
5514       unsigned int j, log_length = exact_log2 (length);
5515       for (i = 0; i < nelt / 2; ++i)
5516         sel[i] = i * 2;
5517       for (i = 0; i < nelt / 2; ++i)
5518         sel[nelt / 2 + i] = i * 2 + 1;
5519       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5520         {
5521           if (dump_enabled_p ())
5522             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5523                              "shuffle of 2 fields structure is not \
5524                               supported by target\n");
5525           return false;
5526         }
5527       perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5528
5529       for (i = 0; i < nelt / 2; ++i)
5530         sel[i] = i * 2 + 1;
5531       for (i = 0; i < nelt / 2; ++i)
5532         sel[nelt / 2 + i] = i * 2;
5533       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5534         {
5535           if (dump_enabled_p ())
5536             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5537                              "shuffle of 2 fields structure is not \
5538                               supported by target\n");
5539           return false;
5540         }
5541       perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5542
5543       /* Generating permutation constant to shift all elements.
5544          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
5545       for (i = 0; i < nelt; i++)
5546         sel[i] = nelt / 2 + i;
5547       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5548         {
5549           if (dump_enabled_p ())
5550             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5551                              "shift permutation is not supported by target\n");
5552           return false;
5553         }
5554       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5555
5556       /* Generating permutation constant to select vector from 2.
5557          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
5558       for (i = 0; i < nelt / 2; i++)
5559         sel[i] = i;
5560       for (i = nelt / 2; i < nelt; i++)
5561         sel[i] = nelt + i;
5562       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5563         {
5564           if (dump_enabled_p ())
5565             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5566                              "select is not supported by target\n");
5567           return false;
5568         }
5569       select_mask = vect_gen_perm_mask_checked (vectype, sel);
5570
5571       for (i = 0; i < log_length; i++)
5572         {
5573           for (j = 0; j < length; j += 2)
5574             {
5575               first_vect = dr_chain[j];
5576               second_vect = dr_chain[j + 1];
5577
5578               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5579               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5580                                                first_vect, first_vect,
5581                                                perm2_mask1);
5582               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5583               vect[0] = data_ref;
5584
5585               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5586               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5587                                                second_vect, second_vect,
5588                                                perm2_mask2);
5589               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5590               vect[1] = data_ref;
5591
5592               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5593               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5594                                                vect[0], vect[1], shift1_mask);
5595               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5596               (*result_chain)[j/2 + length/2] = data_ref;
5597
5598               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5599               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5600                                                vect[0], vect[1], select_mask);
5601               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5602               (*result_chain)[j/2] = data_ref;
5603             }
5604           memcpy (dr_chain.address (), result_chain->address (),
5605                   length * sizeof (tree));
5606         }
5607       return true;
5608     }
5609   if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5610     {
5611       unsigned int k = 0, l = 0;
5612
5613       /* Generating permutation constant to get all elements in rigth order.
5614          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
5615       for (i = 0; i < nelt; i++)
5616         {
5617           if (3 * k + (l % 3) >= nelt)
5618             {
5619               k = 0;
5620               l += (3 - (nelt % 3));
5621             }
5622           sel[i] = 3 * k + (l % 3);
5623           k++;
5624         }
5625       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5626         {
5627           if (dump_enabled_p ())
5628             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5629                              "shuffle of 3 fields structure is not \
5630                               supported by target\n");
5631           return false;
5632         }
5633       perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5634
5635       /* Generating permutation constant to shift all elements.
5636          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
5637       for (i = 0; i < nelt; i++)
5638         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5639       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5640         {
5641           if (dump_enabled_p ())
5642             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5643                              "shift permutation is not supported by target\n");
5644           return false;
5645         }
5646       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5647
5648       /* Generating permutation constant to shift all elements.
5649          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5650       for (i = 0; i < nelt; i++)
5651         sel[i] = 2 * (nelt / 3) + 1 + i;
5652       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5653         {
5654           if (dump_enabled_p ())
5655             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5656                              "shift permutation is not supported by target\n");
5657           return false;
5658         }
5659       shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5660
5661       /* Generating permutation constant to shift all elements.
5662          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
5663       for (i = 0; i < nelt; i++)
5664         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5665       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5666         {
5667           if (dump_enabled_p ())
5668             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5669                              "shift permutation is not supported by target\n");
5670           return false;
5671         }
5672       shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5673
5674       /* Generating permutation constant to shift all elements.
5675          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5676       for (i = 0; i < nelt; i++)
5677         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5678       if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5679         {
5680           if (dump_enabled_p ())
5681             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5682                              "shift permutation is not supported by target\n");
5683           return false;
5684         }
5685       shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5686
5687       for (k = 0; k < 3; k++)
5688         {
5689           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5690           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5691                                            dr_chain[k], dr_chain[k],
5692                                            perm3_mask);
5693           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5694           vect[k] = data_ref;
5695         }
5696
5697       for (k = 0; k < 3; k++)
5698         {
5699           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5700           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5701                                            vect[k % 3], vect[(k + 1) % 3],
5702                                            shift1_mask);
5703           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5704           vect_shift[k] = data_ref;
5705         }
5706
5707       for (k = 0; k < 3; k++)
5708         {
5709           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5710           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5711                                            vect_shift[(4 - k) % 3],
5712                                            vect_shift[(3 - k) % 3],
5713                                            shift2_mask);
5714           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5715           vect[k] = data_ref;
5716         }
5717
5718       (*result_chain)[3 - (nelt % 3)] = vect[2];
5719
5720       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5721       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5722                                        vect[0], shift3_mask);
5723       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5724       (*result_chain)[nelt % 3] = data_ref;
5725
5726       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5727       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5728                                        vect[1], shift4_mask);
5729       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5730       (*result_chain)[0] = data_ref;
5731       return true;
5732     }
5733   return false;
5734 }
5735
5736 /* Function vect_transform_grouped_load.
5737
5738    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5739    to perform their permutation and ascribe the result vectorized statements to
5740    the scalar statements.
5741 */
5742
5743 void
5744 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5745                              gimple_stmt_iterator *gsi)
5746 {
5747   machine_mode mode;
5748   vec<tree> result_chain = vNULL;
5749
5750   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5751      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5752      vectors, that are ready for vector computation.  */
5753   result_chain.create (size);
5754
5755   /* If reassociation width for vector type is 2 or greater target machine can
5756      execute 2 or more vector instructions in parallel.  Otherwise try to
5757      get chain for loads group using vect_shift_permute_load_chain.  */
5758   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5759   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5760       || pow2p_hwi (size)
5761       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5762                                          gsi, &result_chain))
5763     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5764   vect_record_grouped_load_vectors (stmt, result_chain);
5765   result_chain.release ();
5766 }
5767
5768 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5769    generated as part of the vectorization of STMT.  Assign the statement
5770    for each vector to the associated scalar statement.  */
5771
5772 void
5773 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5774 {
5775   gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5776   gimple *next_stmt, *new_stmt;
5777   unsigned int i, gap_count;
5778   tree tmp_data_ref;
5779
5780   /* Put a permuted data-ref in the VECTORIZED_STMT field.
5781      Since we scan the chain starting from it's first node, their order
5782      corresponds the order of data-refs in RESULT_CHAIN.  */
5783   next_stmt = first_stmt;
5784   gap_count = 1;
5785   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5786     {
5787       if (!next_stmt)
5788         break;
5789
5790       /* Skip the gaps.  Loads created for the gaps will be removed by dead
5791        code elimination pass later.  No need to check for the first stmt in
5792        the group, since it always exists.
5793        GROUP_GAP is the number of steps in elements from the previous
5794        access (if there is no gap GROUP_GAP is 1).  We skip loads that
5795        correspond to the gaps.  */
5796       if (next_stmt != first_stmt
5797           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5798       {
5799         gap_count++;
5800         continue;
5801       }
5802
5803       while (next_stmt)
5804         {
5805           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5806           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5807              copies, and we put the new vector statement in the first available
5808              RELATED_STMT.  */
5809           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5810             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5811           else
5812             {
5813               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5814                 {
5815                   gimple *prev_stmt =
5816                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5817                   gimple *rel_stmt =
5818                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5819                   while (rel_stmt)
5820                     {
5821                       prev_stmt = rel_stmt;
5822                       rel_stmt =
5823                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5824                     }
5825
5826                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5827                     new_stmt;
5828                 }
5829             }
5830
5831           next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5832           gap_count = 1;
5833           /* If NEXT_STMT accesses the same DR as the previous statement,
5834              put the same TMP_DATA_REF as its vectorized statement; otherwise
5835              get the next data-ref from RESULT_CHAIN.  */
5836           if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5837             break;
5838         }
5839     }
5840 }
5841
5842 /* Function vect_force_dr_alignment_p.
5843
5844    Returns whether the alignment of a DECL can be forced to be aligned
5845    on ALIGNMENT bit boundary.  */
5846
5847 bool
5848 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5849 {
5850   if (!VAR_P (decl))
5851     return false;
5852
5853   if (decl_in_symtab_p (decl)
5854       && !symtab_node::get (decl)->can_increase_alignment_p ())
5855     return false;
5856
5857   if (TREE_STATIC (decl))
5858     return (alignment <= MAX_OFILE_ALIGNMENT);
5859   else
5860     return (alignment <= MAX_STACK_ALIGNMENT);
5861 }
5862
5863
5864 /* Return whether the data reference DR is supported with respect to its
5865    alignment.
5866    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5867    it is aligned, i.e., check if it is possible to vectorize it with different
5868    alignment.  */
5869
5870 enum dr_alignment_support
5871 vect_supportable_dr_alignment (struct data_reference *dr,
5872                                bool check_aligned_accesses)
5873 {
5874   gimple *stmt = DR_STMT (dr);
5875   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5876   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5877   machine_mode mode = TYPE_MODE (vectype);
5878   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5879   struct loop *vect_loop = NULL;
5880   bool nested_in_vect_loop = false;
5881
5882   if (aligned_access_p (dr) && !check_aligned_accesses)
5883     return dr_aligned;
5884
5885   /* For now assume all conditional loads/stores support unaligned
5886      access without any special code.  */
5887   if (is_gimple_call (stmt)
5888       && gimple_call_internal_p (stmt)
5889       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5890           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5891     return dr_unaligned_supported;
5892
5893   if (loop_vinfo)
5894     {
5895       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5896       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5897     }
5898
5899   /* Possibly unaligned access.  */
5900
5901   /* We can choose between using the implicit realignment scheme (generating
5902      a misaligned_move stmt) and the explicit realignment scheme (generating
5903      aligned loads with a REALIGN_LOAD).  There are two variants to the
5904      explicit realignment scheme: optimized, and unoptimized.
5905      We can optimize the realignment only if the step between consecutive
5906      vector loads is equal to the vector size.  Since the vector memory
5907      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5908      is guaranteed that the misalignment amount remains the same throughout the
5909      execution of the vectorized loop.  Therefore, we can create the
5910      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5911      at the loop preheader.
5912
5913      However, in the case of outer-loop vectorization, when vectorizing a
5914      memory access in the inner-loop nested within the LOOP that is now being
5915      vectorized, while it is guaranteed that the misalignment of the
5916      vectorized memory access will remain the same in different outer-loop
5917      iterations, it is *not* guaranteed that is will remain the same throughout
5918      the execution of the inner-loop.  This is because the inner-loop advances
5919      with the original scalar step (and not in steps of VS).  If the inner-loop
5920      step happens to be a multiple of VS, then the misalignment remains fixed
5921      and we can use the optimized realignment scheme.  For example:
5922
5923       for (i=0; i<N; i++)
5924         for (j=0; j<M; j++)
5925           s += a[i+j];
5926
5927      When vectorizing the i-loop in the above example, the step between
5928      consecutive vector loads is 1, and so the misalignment does not remain
5929      fixed across the execution of the inner-loop, and the realignment cannot
5930      be optimized (as illustrated in the following pseudo vectorized loop):
5931
5932       for (i=0; i<N; i+=4)
5933         for (j=0; j<M; j++){
5934           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5935                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
5936                          // (assuming that we start from an aligned address).
5937           }
5938
5939      We therefore have to use the unoptimized realignment scheme:
5940
5941       for (i=0; i<N; i+=4)
5942           for (j=k; j<M; j+=4)
5943           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5944                            // that the misalignment of the initial address is
5945                            // 0).
5946
5947      The loop can then be vectorized as follows:
5948
5949       for (k=0; k<4; k++){
5950         rt = get_realignment_token (&vp[k]);
5951         for (i=0; i<N; i+=4){
5952           v1 = vp[i+k];
5953           for (j=k; j<M; j+=4){
5954             v2 = vp[i+j+VS-1];
5955             va = REALIGN_LOAD <v1,v2,rt>;
5956             vs += va;
5957             v1 = v2;
5958           }
5959         }
5960     } */
5961
5962   if (DR_IS_READ (dr))
5963     {
5964       bool is_packed = false;
5965       tree type = (TREE_TYPE (DR_REF (dr)));
5966
5967       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5968           && (!targetm.vectorize.builtin_mask_for_load
5969               || targetm.vectorize.builtin_mask_for_load ()))
5970         {
5971           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5972
5973           /* If we are doing SLP then the accesses need not have the
5974              same alignment, instead it depends on the SLP group size.  */
5975           if (loop_vinfo
5976               && STMT_SLP_TYPE (stmt_info)
5977               && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5978                   * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
5979                   % TYPE_VECTOR_SUBPARTS (vectype) != 0))
5980             ;
5981           else if (!loop_vinfo
5982                    || (nested_in_vect_loop
5983                        && (TREE_INT_CST_LOW (DR_STEP (dr))
5984                            != GET_MODE_SIZE (TYPE_MODE (vectype)))))
5985             return dr_explicit_realign;
5986           else
5987             return dr_explicit_realign_optimized;
5988         }
5989       if (!known_alignment_for_access_p (dr))
5990         is_packed = not_size_aligned (DR_REF (dr));
5991
5992       if (targetm.vectorize.support_vector_misalignment
5993             (mode, type, DR_MISALIGNMENT (dr), is_packed))
5994         /* Can't software pipeline the loads, but can at least do them.  */
5995         return dr_unaligned_supported;
5996     }
5997   else
5998     {
5999       bool is_packed = false;
6000       tree type = (TREE_TYPE (DR_REF (dr)));
6001
6002       if (!known_alignment_for_access_p (dr))
6003         is_packed = not_size_aligned (DR_REF (dr));
6004
6005      if (targetm.vectorize.support_vector_misalignment
6006            (mode, type, DR_MISALIGNMENT (dr), is_packed))
6007        return dr_unaligned_supported;
6008     }
6009
6010   /* Unsupported.  */
6011   return dr_unaligned_unsupported;
6012 }