1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
56 /* Return true if load- or store-lanes optab OPTAB is implemented for
57 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
59 static bool
60 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
61 tree vectype, unsigned HOST_WIDE_INT count)
63 machine_mode mode;
64 scalar_int_mode array_mode;
65 bool limit_p;
67 mode = TYPE_MODE (vectype);
68 limit_p = !targetm.array_mode_supported_p (mode, count);
69 if (!int_mode_for_size (count * GET_MODE_BITSIZE (mode),
70 limit_p).exists (&array_mode))
72 if (dump_enabled_p ())
73 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
74 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
75 GET_MODE_NAME (mode), count);
76 return false;
79 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
81 if (dump_enabled_p ())
82 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
83 "cannot use %s<%s><%s>\n", name,
84 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
85 return false;
88 if (dump_enabled_p ())
89 dump_printf_loc (MSG_NOTE, vect_location,
90 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
91 GET_MODE_NAME (mode));
93 return true;
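/* A worked example for the function above (the numbers are illustrative
   and not tied to any particular target): for VECTYPE with mode V4SI,
   GET_MODE_BITSIZE (V4SImode) is 128, so with COUNT == 2 the code asks
   int_mode_for_size for a 256-bit integer mode to serve as the array
   mode and then requires a conversion handler for OPTAB between that
   array mode and V4SI; on success the dump would report something like
   "can use vec_load_lanes<OI><V4SI>".  */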
97 /* Return the smallest scalar part of STMT.
98 This is used to determine the vectype of the stmt. We generally set the
99 vectype according to the type of the result (lhs). For stmts whose
100 result-type is different than the type of the arguments (e.g., demotion,
101 promotion), vectype will be reset appropriately (later). Note that we have
102 to visit the smallest datatype in this function, because that determines the
103 VF. If the smallest datatype in the loop is present only as the rhs of a
104 promotion operation - we'd miss it.
105 Such a case, where a variable of this datatype does not appear in the lhs
106 anywhere in the loop, can only occur if it's an invariant: e.g.:
107 'int_x = (int) short_inv', which we'd expect to have been optimized away by
108 invariant motion. However, we cannot rely on invariant motion to always
109 take invariants out of the loop, and so in the case of promotion we also
110 have to check the rhs.
111 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
112 types. */
114 tree
115 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
116 HOST_WIDE_INT *rhs_size_unit)
118 tree scalar_type = gimple_expr_type (stmt);
119 HOST_WIDE_INT lhs, rhs;
121 /* During the analysis phase, this function is called on arbitrary
122 statements that might not have scalar results. */
123 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
124 return scalar_type;
126 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
128 if (is_gimple_assign (stmt)
129 && (gimple_assign_cast_p (stmt)
130 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
131 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
132 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
134 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
136 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
137 if (rhs < lhs)
138 scalar_type = rhs_type;
141 *lhs_size_unit = lhs;
142 *rhs_size_unit = rhs;
143 return scalar_type;
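/* A sketch of the case described before vect_get_smallest_scalar_type
   (hypothetical loop, for illustration only):

     short *s; int *d;
     for (i = 0; i < n; i++)
       d[i] = (int) s[i];

   The smallest scalar type here is 'short', but it appears only as the
   rhs of the widening conversion, which is why the rhs type of casts,
   WIDEN_MULT_EXPR, WIDEN_LSHIFT_EXPR and FLOAT_EXPR is inspected too.  */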
147 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
148 tested at run-time. Return TRUE if DDR was successfully inserted.
149 Return false if versioning is not supported. */
151 static bool
152 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
154 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
156 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
157 return false;
159 if (!runtime_alias_check_p (ddr, loop,
160 optimize_loop_nest_for_speed_p (loop)))
161 return false;
163 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
164 return true;
168 /* A subroutine of vect_analyze_data_ref_dependence. Handle
169 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
170 distances. These distances are conservatively correct but they don't
171 reflect a guaranteed dependence.
173 Return true if this function does all the work necessary to avoid
174 an alias or false if the caller should use the dependence distances
175 to limit the vectorization factor in the usual way. LOOP_DEPTH is
176 the depth of the loop described by LOOP_VINFO and the other arguments
177 are as for vect_analyze_data_ref_dependence. */
179 static bool
180 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
181 loop_vec_info loop_vinfo,
182 int loop_depth, int *max_vf)
184 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
185 lambda_vector dist_v;
186 unsigned int i;
187 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
189 int dist = dist_v[loop_depth];
190 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
192 /* If the user asserted safelen >= DIST consecutive iterations
193 can be executed concurrently, assume independence.
195 ??? An alternative would be to add the alias check even
196 in this case, and vectorize the fallback loop with the
197 maximum VF set to safelen. However, if the user has
198 explicitly given a length, it's less likely that that
199 would be a win. */
200 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
202 if (loop->safelen < *max_vf)
203 *max_vf = loop->safelen;
204 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
205 continue;
208 /* For dependence distances of 2 or more, we have the option
209 of limiting VF or checking for an alias at runtime.
210 Prefer to check at runtime if we can, to avoid limiting
211 the VF unnecessarily when the bases are in fact independent.
213 Note that the alias checks will be removed if the VF ends up
214 being small enough. */
215 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
218 return true;
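/* An illustrative use of the safelen test above (hypothetical source):
   with '#pragma omp simd safelen(8)' the user asserts that up to 8
   consecutive iterations may execute concurrently, so a conservative
   dependence distance of, say, 4 (<= 8) can be ignored, while *MAX_VF
   is clamped to 8 so that no more than 8 iterations are ever run
   together.  */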
222 /* Function vect_analyze_data_ref_dependence.
224 Return TRUE if there (might) exist a dependence between a memory-reference
225 DRA and a memory-reference DRB. When versioning for alias may check a
226 dependence at run-time, return FALSE. Adjust *MAX_VF according to
227 the data dependence. */
229 static bool
230 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
231 loop_vec_info loop_vinfo, int *max_vf)
233 unsigned int i;
234 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
235 struct data_reference *dra = DDR_A (ddr);
236 struct data_reference *drb = DDR_B (ddr);
237 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
238 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
239 lambda_vector dist_v;
240 unsigned int loop_depth;
242 /* In loop analysis all data references should be vectorizable. */
243 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
244 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
245 gcc_unreachable ();
247 /* Independent data accesses. */
248 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
249 return false;
251 if (dra == drb
252 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
253 return false;
255 /* We do not have to consider dependences between accesses that belong
256 to the same group. */
257 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
258 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
259 return false;
261 /* Even if we have an anti-dependence then, as the vectorized loop covers at
262 least two scalar iterations, there is always also a true dependence.
263 As the vectorizer does not re-order loads and stores we can ignore
264 the anti-dependence if TBAA can disambiguate both DRs similar to the
265 case with known negative distance anti-dependences (positive
266 distance anti-dependences would violate TBAA constraints). */
267 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
268 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
269 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
270 get_alias_set (DR_REF (drb))))
271 return false;
273 /* Unknown data dependence. */
274 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
276 /* If user asserted safelen consecutive iterations can be
277 executed concurrently, assume independence. */
278 if (loop->safelen >= 2)
280 if (loop->safelen < *max_vf)
281 *max_vf = loop->safelen;
282 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
283 return false;
286 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
287 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
289 if (dump_enabled_p ())
291 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
292 "versioning for alias not supported for: "
293 "can't determine dependence between ");
294 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
295 DR_REF (dra));
296 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
297 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
298 DR_REF (drb));
299 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
301 return true;
304 if (dump_enabled_p ())
306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
307 "versioning for alias required: "
308 "can't determine dependence between ");
309 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
310 DR_REF (dra));
311 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
312 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
313 DR_REF (drb));
314 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
317 /* Add to list of ddrs that need to be tested at run-time. */
318 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
321 /* Known data dependence. */
322 if (DDR_NUM_DIST_VECTS (ddr) == 0)
324 /* If user asserted safelen consecutive iterations can be
325 executed concurrently, assume independence. */
326 if (loop->safelen >= 2)
328 if (loop->safelen < *max_vf)
329 *max_vf = loop->safelen;
330 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
331 return false;
334 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
335 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
340 "versioning for alias not supported for: "
341 "bad dist vector for ");
342 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
343 DR_REF (dra));
344 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
345 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
346 DR_REF (drb));
347 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
349 return true;
352 if (dump_enabled_p ())
354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
355 "versioning for alias required: "
356 "bad dist vector for ");
357 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
358 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
359 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
360 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
362 /* Add to list of ddrs that need to be tested at run-time. */
363 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
366 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
368 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
369 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
370 loop_depth, max_vf))
371 return false;
373 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
375 int dist = dist_v[loop_depth];
377 if (dump_enabled_p ())
378 dump_printf_loc (MSG_NOTE, vect_location,
379 "dependence distance = %d.\n", dist);
381 if (dist == 0)
383 if (dump_enabled_p ())
385 dump_printf_loc (MSG_NOTE, vect_location,
386 "dependence distance == 0 between ");
387 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
388 dump_printf (MSG_NOTE, " and ");
389 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
390 dump_printf (MSG_NOTE, "\n");
393 /* When we perform grouped accesses and perform implicit CSE
394 by detecting equal accesses and doing disambiguation with
395 runtime alias tests like for
396 .. = a[i];
397 .. = a[i+1];
398 a[i] = ..;
399 a[i+1] = ..;
400 *p = ..;
401 .. = a[i];
402 .. = a[i+1];
403 where we will end up loading { a[i], a[i+1] } once, make
404 sure that inserting group loads before the first load and
405 stores after the last store will do the right thing.
406 Similar for groups like
407 a[i] = ...;
408 ... = a[i];
409 a[i+1] = ...;
410 where loads from the group interleave with the store. */
411 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
412 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
414 gimple *earlier_stmt;
415 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
416 if (DR_IS_WRITE
417 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
419 if (dump_enabled_p ())
420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
421 "READ_WRITE dependence in interleaving."
422 "\n");
423 return true;
427 continue;
430 if (dist > 0 && DDR_REVERSED_P (ddr))
432 /* If DDR_REVERSED_P the order of the data-refs in DDR was
433 reversed (to make distance vector positive), and the actual
434 distance is negative. */
435 if (dump_enabled_p ())
436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
437 "dependence distance negative.\n");
438 /* Record a negative dependence distance to later limit the
439 amount of stmt copying / unrolling we can perform.
440 Only need to handle read-after-write dependence. */
441 if (DR_IS_READ (drb)
442 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
443 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
444 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
445 continue;
448 if (abs (dist) >= 2
449 && abs (dist) < *max_vf)
451 /* The dependence distance requires reduction of the maximal
452 vectorization factor. */
453 *max_vf = abs (dist);
454 if (dump_enabled_p ())
455 dump_printf_loc (MSG_NOTE, vect_location,
456 "adjusting maximal vectorization factor to %i\n",
457 *max_vf);
460 if (abs (dist) >= *max_vf)
462 /* Dependence distance does not create dependence, as far as
463 vectorization is concerned, in this case. */
464 if (dump_enabled_p ())
465 dump_printf_loc (MSG_NOTE, vect_location,
466 "dependence distance >= VF.\n");
467 continue;
470 if (dump_enabled_p ())
472 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
473 "not vectorized, possible dependence "
474 "between data-refs ");
475 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
476 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
477 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
478 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
481 return true;
484 return false;
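/* A worked example of the distance handling above (hypothetical loop):

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   gives a read-after-write dependence with distance 3, so the code above
   clamps *MAX_VF to 3: any vector of at most 3 elements only reads
   values that were written by earlier vector iterations.  */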
487 /* Function vect_analyze_data_ref_dependences.
489 Examine all the data references in the loop, and make sure there do not
490 exist any data dependences between them. Set *MAX_VF according to
491 the maximum vectorization factor the data dependences allow. */
493 bool
494 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
496 unsigned int i;
497 struct data_dependence_relation *ddr;
499 if (dump_enabled_p ())
500 dump_printf_loc (MSG_NOTE, vect_location,
501 "=== vect_analyze_data_ref_dependences ===\n");
503 LOOP_VINFO_DDRS (loop_vinfo)
504 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
505 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
506 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
507 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
508 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
509 &LOOP_VINFO_DDRS (loop_vinfo),
510 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
511 return false;
513 /* For epilogues we either have no aliases or alias versioning
514 was applied to the original loop. Therefore we may just get max_vf
515 using the VF of the original loop. */
516 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
517 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
518 else
519 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
520 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
521 return false;
523 return true;
527 /* Function vect_slp_analyze_data_ref_dependence.
529 Return TRUE if there (might) exist a dependence between a memory-reference
530 DRA and a memory-reference DRB. Unlike the loop analysis above, no
531 run-time alias versioning is attempted here and there is no MAX_VF to
532 adjust. */
534 static bool
535 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
537 struct data_reference *dra = DDR_A (ddr);
538 struct data_reference *drb = DDR_B (ddr);
540 /* We need to check dependences of statements marked as unvectorizable
541 as well; they can still prohibit vectorization. */
543 /* Independent data accesses. */
544 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
545 return false;
547 if (dra == drb)
548 return false;
550 /* Read-read is OK. */
551 if (DR_IS_READ (dra) && DR_IS_READ (drb))
552 return false;
554 /* If dra and drb are part of the same interleaving chain consider
555 them independent. */
556 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
557 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
558 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
559 return false;
561 /* Unknown data dependence. */
562 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
564 if (dump_enabled_p ())
566 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
567 "can't determine dependence between ");
568 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
569 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
570 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
571 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
574 else if (dump_enabled_p ())
576 dump_printf_loc (MSG_NOTE, vect_location,
577 "determined dependence between ");
578 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
579 dump_printf (MSG_NOTE, " and ");
580 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
581 dump_printf (MSG_NOTE, "\n");
584 return true;
588 /* Analyze dependences involved in the transform of SLP NODE. STORES
589 contain the vector of scalar stores of this instance if we are
590 disambiguating the loads. */
592 static bool
593 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
594 vec<gimple *> stores, gimple *last_store)
596 /* This walks over all stmts involved in the SLP load/store done
597 in NODE verifying we can sink them up to the last stmt in the
598 group. */
599 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
600 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
602 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
603 if (access == last_access)
604 continue;
605 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
606 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
607 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
609 gimple *stmt = gsi_stmt (gsi);
610 if (! gimple_vuse (stmt)
611 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
612 continue;
614 /* If we couldn't record a (single) data reference for this
615 stmt we have to give up. */
616 /* ??? Here and below if dependence analysis fails we can resort
617 to the alias oracle which can handle more kinds of stmts. */
618 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
619 if (!dr_b)
620 return false;
622 bool dependent = false;
623 /* If we run into a store of this same instance (we've just
624 marked those) then delay dependence checking until we run
625 into the last store because this is where it will have
626 been sunk to (and we verify if we can do that as well). */
627 if (gimple_visited_p (stmt))
629 if (stmt != last_store)
630 continue;
631 unsigned i;
632 gimple *store;
633 FOR_EACH_VEC_ELT (stores, i, store)
635 data_reference *store_dr
636 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
637 ddr_p ddr = initialize_data_dependence_relation
638 (dr_a, store_dr, vNULL);
639 dependent = vect_slp_analyze_data_ref_dependence (ddr);
640 free_dependence_relation (ddr);
641 if (dependent)
642 break;
645 else
647 ddr_p ddr = initialize_data_dependence_relation (dr_a,
648 dr_b, vNULL);
649 dependent = vect_slp_analyze_data_ref_dependence (ddr);
650 free_dependence_relation (ddr);
652 if (dependent)
653 return false;
656 return true;
660 /* Function vect_slp_analyze_instance_dependence.
662 Examine all the data references in the SLP instance, and make sure there
663 do not exist any data dependences between them that would prevent the
664 instance from being vectorized. */
666 bool
667 vect_slp_analyze_instance_dependence (slp_instance instance)
669 if (dump_enabled_p ())
670 dump_printf_loc (MSG_NOTE, vect_location,
671 "=== vect_slp_analyze_instance_dependence ===\n");
673 /* The stores of this instance are at the root of the SLP tree. */
674 slp_tree store = SLP_INSTANCE_TREE (instance);
675 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
676 store = NULL;
678 /* Verify we can sink stores to the vectorized stmt insert location. */
679 gimple *last_store = NULL;
680 if (store)
682 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
683 return false;
685 /* Mark stores in this instance and remember the last one. */
686 last_store = vect_find_last_scalar_stmt_in_slp (store);
687 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
688 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
691 bool res = true;
693 /* Verify we can sink loads to the vectorized stmt insert location,
694 special-casing stores of this instance. */
695 slp_tree load;
696 unsigned int i;
697 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
698 if (! vect_slp_analyze_node_dependences (instance, load,
699 store
700 ? SLP_TREE_SCALAR_STMTS (store)
701 : vNULL, last_store))
703 res = false;
704 break;
707 /* Unset the visited flag. */
708 if (store)
709 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
710 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
712 return res;
715 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
716 the statement that contains DRB, which is useful for recording in the
717 dump file. */
719 static void
720 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
721 innermost_loop_behavior *drb)
723 bool existed;
724 innermost_loop_behavior *&entry
725 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
726 if (!existed || entry->base_alignment < drb->base_alignment)
728 entry = drb;
729 if (dump_enabled_p ())
731 dump_printf_loc (MSG_NOTE, vect_location,
732 "recording new base alignment for ");
733 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
734 dump_printf (MSG_NOTE, "\n");
735 dump_printf_loc (MSG_NOTE, vect_location,
736 " alignment: %d\n", drb->base_alignment);
737 dump_printf_loc (MSG_NOTE, vect_location,
738 " misalignment: %d\n", drb->base_misalignment);
739 dump_printf_loc (MSG_NOTE, vect_location,
740 " based on: ");
741 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
746 /* If the region we're going to vectorize is reached, all unconditional
747 data references occur at least once. We can therefore pool the base
748 alignment guarantees from each unconditional reference. Do this by
749 going through all the data references in VINFO and checking whether
750 the containing statement makes the reference unconditionally. If so,
751 record the alignment of the base address in VINFO so that it can be
752 used for all other references with the same base. */
754 void
755 vect_record_base_alignments (vec_info *vinfo)
757 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
758 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
759 data_reference *dr;
760 unsigned int i;
761 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
762 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
764 gimple *stmt = DR_STMT (dr);
765 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
767 /* If DR is nested in the loop that is being vectorized, we can also
768 record the alignment of the base wrt the outer loop. */
769 if (loop && nested_in_vect_loop_p (loop, stmt))
771 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
772 vect_record_base_alignment
773 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
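/* Illustration of the pooling above (hypothetical references): if the
   region contains an unconditional load from base P whose innermost
   behavior guarantees a 32-byte aligned base, that guarantee is keyed on
   P in VINFO->base_alignments and can later be reused when computing the
   misalignment of another reference with the same base, e.g. one that is
   only executed conditionally and therefore could not have been used to
   establish the guarantee itself.  */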
778 /* Return the target alignment for the vectorized form of DR. */
780 static unsigned int
781 vect_calculate_target_alignment (struct data_reference *dr)
783 gimple *stmt = DR_STMT (dr);
784 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
785 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
786 return targetm.vectorize.preferred_vector_alignment (vectype);
789 /* Function vect_compute_data_ref_alignment
791 Compute the misalignment of the data reference DR.
793 Output:
794 1. If during the misalignment computation it is found that the data reference
795 cannot be vectorized then false is returned.
796 2. DR_MISALIGNMENT (DR) is defined.
798 FOR NOW: No analysis is actually performed. Misalignment is calculated
799 only for trivial cases. TODO. */
801 bool
802 vect_compute_data_ref_alignment (struct data_reference *dr)
804 gimple *stmt = DR_STMT (dr);
805 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
806 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
807 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
808 struct loop *loop = NULL;
809 tree ref = DR_REF (dr);
810 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
812 if (dump_enabled_p ())
813 dump_printf_loc (MSG_NOTE, vect_location,
814 "vect_compute_data_ref_alignment:\n");
816 if (loop_vinfo)
817 loop = LOOP_VINFO_LOOP (loop_vinfo);
819 /* Initialize misalignment to unknown. */
820 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
822 innermost_loop_behavior *drb = vect_dr_behavior (dr);
823 bool step_preserves_misalignment_p;
825 unsigned HOST_WIDE_INT vector_alignment
826 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
827 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
829 /* No step for BB vectorization. */
830 if (!loop)
832 gcc_assert (integer_zerop (drb->step));
833 step_preserves_misalignment_p = true;
836 /* In case the dataref is in an inner-loop of the loop that is being
837 vectorized (LOOP), we use the base and misalignment information
838 relative to the outer-loop (LOOP). This is ok only if the misalignment
839 stays the same throughout the execution of the inner-loop, which is why
840 we have to check that the stride of the dataref in the inner-loop is
841 a multiple of the vector alignment. */
842 else if (nested_in_vect_loop_p (loop, stmt))
844 step_preserves_misalignment_p
845 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
847 if (dump_enabled_p ())
849 if (step_preserves_misalignment_p)
850 dump_printf_loc (MSG_NOTE, vect_location,
851 "inner step divides the vector alignment.\n");
852 else
853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
854 "inner step doesn't divide the vector"
855 " alignment.\n");
859 /* Similarly we can only use base and misalignment information relative to
860 an innermost loop if the misalignment stays the same throughout the
861 execution of the loop. As above, this is the case if the stride of
862 the dataref is a multiple of the alignment. */
863 else
865 unsigned vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
866 step_preserves_misalignment_p
867 = ((DR_STEP_ALIGNMENT (dr) * vf) % vector_alignment) == 0;
869 if (!step_preserves_misalignment_p && dump_enabled_p ())
870 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
871 "step doesn't divide the vector alignment.\n");
874 unsigned int base_alignment = drb->base_alignment;
875 unsigned int base_misalignment = drb->base_misalignment;
877 /* Calculate the maximum of the pooled base address alignment and the
878 alignment that we can compute for DR itself. */
879 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
880 if (entry && base_alignment < (*entry)->base_alignment)
882 base_alignment = (*entry)->base_alignment;
883 base_misalignment = (*entry)->base_misalignment;
886 if (drb->offset_alignment < vector_alignment
887 || !step_preserves_misalignment_p
888 /* We need to know whether the step wrt the vectorized loop is
889 negative when computing the starting misalignment below. */
890 || TREE_CODE (drb->step) != INTEGER_CST)
892 if (dump_enabled_p ())
894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
895 "Unknown alignment for access: ");
896 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
897 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
899 return true;
902 if (base_alignment < vector_alignment)
904 tree base = drb->base_address;
905 if (TREE_CODE (base) == ADDR_EXPR)
906 base = TREE_OPERAND (base, 0);
907 if (!vect_can_force_dr_alignment_p (base,
908 vector_alignment * BITS_PER_UNIT))
910 if (dump_enabled_p ())
912 dump_printf_loc (MSG_NOTE, vect_location,
913 "can't force alignment of ref: ");
914 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
915 dump_printf (MSG_NOTE, "\n");
917 return true;
920 if (DECL_USER_ALIGN (base))
922 if (dump_enabled_p ())
924 dump_printf_loc (MSG_NOTE, vect_location,
925 "not forcing alignment of user-aligned "
926 "variable: ");
927 dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
928 dump_printf (MSG_NOTE, "\n");
930 return true;
933 /* Force the alignment of the decl.
934 NOTE: This is the only change to the code we make during
935 the analysis phase, before deciding to vectorize the loop. */
936 if (dump_enabled_p ())
938 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
939 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
940 dump_printf (MSG_NOTE, "\n");
943 DR_VECT_AUX (dr)->base_decl = base;
944 DR_VECT_AUX (dr)->base_misaligned = true;
945 base_misalignment = 0;
947 poly_int64 misalignment
948 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
950 /* If this is a backward running DR then first access in the larger
951 vectype actually is N-1 elements before the address in the DR.
952 Adjust misalign accordingly. */
953 if (tree_int_cst_sgn (drb->step) < 0)
954 /* PLUS because STEP is negative. */
955 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
956 * TREE_INT_CST_LOW (drb->step));
958 unsigned int const_misalignment;
959 if (!known_misalignment (misalignment, vector_alignment,
960 &const_misalignment))
962 if (dump_enabled_p ())
964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
965 "Non-constant misalignment for access: ");
966 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
967 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
969 return true;
972 SET_DR_MISALIGNMENT (dr, const_misalignment);
974 if (dump_enabled_p ())
976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
977 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
978 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
979 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
982 return true;
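/* A small worked example for the computation above (illustrative numbers):
   with a 16-byte target alignment, a base known to be 16-byte aligned
   (base_misalignment 0), DRB->init equal to 20 and a positive step, the
   recorded misalignment is 20 mod 16 = 4 bytes. For a backward running
   V4SI access with step -4, the first element of the vector lies
   3 * 4 = 12 bytes before the DR address, and the misalignment is
   adjusted accordingly before taking the remainder.  */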
985 /* Function vect_update_misalignment_for_peel.
986 Sets DR's misalignment
987 - to 0 if it has the same alignment as DR_PEEL,
988 - to the misalignment computed using NPEEL if DR's misalignment is known,
989 - to -1 (unknown) otherwise.
991 DR - the data reference whose misalignment is to be adjusted.
992 DR_PEEL - the data reference whose misalignment is being made
993 zero in the vector loop by the peel.
994 NPEEL - the number of iterations in the peel loop if the misalignment
995 of DR_PEEL is known at compile time. */
997 static void
998 vect_update_misalignment_for_peel (struct data_reference *dr,
999 struct data_reference *dr_peel, int npeel)
1001 unsigned int i;
1002 vec<dr_p> same_aligned_drs;
1003 struct data_reference *current_dr;
1004 int dr_size = vect_get_scalar_dr_size (dr);
1005 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1006 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1007 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1009 /* For interleaved data accesses the step in the loop must be multiplied by
1010 the size of the interleaving group. */
1011 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1012 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1013 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1014 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1016 /* It can be assumed that the data refs with the same alignment as dr_peel
1017 are aligned in the vector loop. */
1018 same_aligned_drs
1019 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1020 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1022 if (current_dr != dr)
1023 continue;
1024 gcc_assert (!known_alignment_for_access_p (dr)
1025 || !known_alignment_for_access_p (dr_peel)
1026 || (DR_MISALIGNMENT (dr) / dr_size
1027 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1028 SET_DR_MISALIGNMENT (dr, 0);
1029 return;
1032 if (known_alignment_for_access_p (dr)
1033 && known_alignment_for_access_p (dr_peel))
1035 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1036 int misal = DR_MISALIGNMENT (dr);
1037 misal += negative ? -npeel * dr_size : npeel * dr_size;
1038 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1039 SET_DR_MISALIGNMENT (dr, misal);
1040 return;
1043 if (dump_enabled_p ())
1044 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1045 "to unknown (-1).\n");
1046 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
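/* A worked example for the known-alignment case above (illustrative
   numbers): for DR with scalar size 4, current misalignment 12, target
   alignment 16 and a positive step, peeling NPEEL == 3 iterations moves
   the first access by 3 * 4 = 12 bytes, so the new misalignment becomes
   (12 + 12) & 15 == 8.  */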
1050 /* Function verify_data_ref_alignment
1052 Return TRUE if DR can be handled with respect to alignment. */
1054 static bool
1055 verify_data_ref_alignment (data_reference_p dr)
1057 enum dr_alignment_support supportable_dr_alignment
1058 = vect_supportable_dr_alignment (dr, false);
1059 if (!supportable_dr_alignment)
1061 if (dump_enabled_p ())
1063 if (DR_IS_READ (dr))
1064 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1065 "not vectorized: unsupported unaligned load.");
1066 else
1067 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1068 "not vectorized: unsupported unaligned "
1069 "store.");
1071 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1072 DR_REF (dr));
1073 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1075 return false;
1078 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1079 dump_printf_loc (MSG_NOTE, vect_location,
1080 "Vectorizing an unaligned access.\n");
1082 return true;
1085 /* Function vect_verify_datarefs_alignment
1087 Return TRUE if all data references in the loop can be
1088 handled with respect to alignment. */
1090 bool
1091 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1093 vec<data_reference_p> datarefs = vinfo->datarefs;
1094 struct data_reference *dr;
1095 unsigned int i;
1097 FOR_EACH_VEC_ELT (datarefs, i, dr)
1099 gimple *stmt = DR_STMT (dr);
1100 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1102 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1103 continue;
1105 /* For interleaving, only the alignment of the first access matters. */
1106 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1107 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1108 continue;
1110 /* Strided accesses perform only component accesses, alignment is
1111 irrelevant for them. */
1112 if (STMT_VINFO_STRIDED_P (stmt_info)
1113 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1114 continue;
1116 if (! verify_data_ref_alignment (dr))
1117 return false;
1120 return true;
1123 /* Given a memory reference EXP return whether its alignment is less
1124 than its size. */
1126 static bool
1127 not_size_aligned (tree exp)
1129 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1130 return true;
1132 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1133 > get_object_alignment (exp));
1136 /* Function vector_alignment_reachable_p
1138 Return true if vector alignment for DR is reachable by peeling
1139 a few loop iterations. Return false otherwise. */
1141 static bool
1142 vector_alignment_reachable_p (struct data_reference *dr)
1144 gimple *stmt = DR_STMT (dr);
1145 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1146 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1148 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1150 /* For interleaved access we peel only if the number of iterations in
1151 the prolog loop ({VF - misalignment}) is a multiple of the
1152 number of interleaved accesses. */
1153 int elem_size, mis_in_elements;
1154 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1156 /* FORNOW: handle only known alignment. */
1157 if (!known_alignment_for_access_p (dr))
1158 return false;
1160 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1161 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1163 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1164 return false;
1167 /* If misalignment is known at compile time then allow peeling
1168 only if natural alignment is reachable through peeling. */
1169 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1171 HOST_WIDE_INT elmsize =
1172 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1173 if (dump_enabled_p ())
1175 dump_printf_loc (MSG_NOTE, vect_location,
1176 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1177 dump_printf (MSG_NOTE,
1178 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1180 if (DR_MISALIGNMENT (dr) % elmsize)
1182 if (dump_enabled_p ())
1183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1184 "data size does not divide the misalignment.\n");
1185 return false;
1189 if (!known_alignment_for_access_p (dr))
1191 tree type = TREE_TYPE (DR_REF (dr));
1192 bool is_packed = not_size_aligned (DR_REF (dr));
1193 if (dump_enabled_p ())
1194 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1195 "Unknown misalignment, %snaturally aligned\n",
1196 is_packed ? "not " : "");
1197 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1200 return true;
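/* A worked example for the grouped-access check above (illustrative
   numbers): for a V4SI vector (4 elements of 4 bytes) and a known
   misalignment of 8 bytes, mis_in_elements is 2, so 4 - 2 = 2 prologue
   iterations would be peeled. That is fine for an interleaving group of
   size 2, but for a group of size 3 the remainder 2 % 3 != 0 makes the
   alignment unreachable by peeling.  */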
1204 /* Calculate the cost of the memory access represented by DR. */
1206 static void
1207 vect_get_data_access_cost (struct data_reference *dr,
1208 unsigned int *inside_cost,
1209 unsigned int *outside_cost,
1210 stmt_vector_for_cost *body_cost_vec)
1212 gimple *stmt = DR_STMT (dr);
1213 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1214 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1215 int ncopies;
1217 if (PURE_SLP_STMT (stmt_info))
1218 ncopies = 1;
1219 else
1220 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1222 if (DR_IS_READ (dr))
1223 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1224 NULL, body_cost_vec, false);
1225 else
1226 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1228 if (dump_enabled_p ())
1229 dump_printf_loc (MSG_NOTE, vect_location,
1230 "vect_get_data_access_cost: inside_cost = %d, "
1231 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1235 typedef struct _vect_peel_info
1237 struct data_reference *dr;
1238 int npeel;
1239 unsigned int count;
1240 } *vect_peel_info;
1242 typedef struct _vect_peel_extended_info
1244 struct _vect_peel_info peel_info;
1245 unsigned int inside_cost;
1246 unsigned int outside_cost;
1247 } *vect_peel_extended_info;
1250 /* Peeling hashtable helpers. */
1252 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1254 static inline hashval_t hash (const _vect_peel_info *);
1255 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1258 inline hashval_t
1259 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1261 return (hashval_t) peel_info->npeel;
1264 inline bool
1265 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1267 return (a->npeel == b->npeel);
1271 /* Insert DR into peeling hash table with NPEEL as key. */
1273 static void
1274 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1275 loop_vec_info loop_vinfo, struct data_reference *dr,
1276 int npeel)
1278 struct _vect_peel_info elem, *slot;
1279 _vect_peel_info **new_slot;
1280 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1282 elem.npeel = npeel;
1283 slot = peeling_htab->find (&elem);
1284 if (slot)
1285 slot->count++;
1286 else
1288 slot = XNEW (struct _vect_peel_info);
1289 slot->npeel = npeel;
1290 slot->dr = dr;
1291 slot->count = 1;
1292 new_slot = peeling_htab->find_slot (slot, INSERT);
1293 *new_slot = slot;
1296 if (!supportable_dr_alignment
1297 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1298 slot->count += VECT_MAX_COST;
1302 /* Traverse peeling hash table to find peeling option that aligns maximum
1303 number of data accesses. */
1306 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1307 _vect_peel_extended_info *max)
1309 vect_peel_info elem = *slot;
1311 if (elem->count > max->peel_info.count
1312 || (elem->count == max->peel_info.count
1313 && max->peel_info.npeel > elem->npeel))
1315 max->peel_info.npeel = elem->npeel;
1316 max->peel_info.count = elem->count;
1317 max->peel_info.dr = elem->dr;
1320 return 1;
1323 /* Get the costs of peeling NPEEL iterations checking data access costs
1324 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1325 misalignment will be zero after peeling. */
1327 static void
1328 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1329 struct data_reference *dr0,
1330 unsigned int *inside_cost,
1331 unsigned int *outside_cost,
1332 stmt_vector_for_cost *body_cost_vec,
1333 unsigned int npeel,
1334 bool unknown_misalignment)
1336 unsigned i;
1337 data_reference *dr;
1339 FOR_EACH_VEC_ELT (datarefs, i, dr)
1341 gimple *stmt = DR_STMT (dr);
1342 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1343 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1344 continue;
1346 /* For interleaving, only the alignment of the first access
1347 matters. */
1348 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1349 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1350 continue;
1352 /* Strided accesses perform only component accesses, alignment is
1353 irrelevant for them. */
1354 if (STMT_VINFO_STRIDED_P (stmt_info)
1355 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1356 continue;
1358 int save_misalignment;
1359 save_misalignment = DR_MISALIGNMENT (dr);
1360 if (npeel == 0)
1362 else if (unknown_misalignment && dr == dr0)
1363 SET_DR_MISALIGNMENT (dr, 0);
1364 else
1365 vect_update_misalignment_for_peel (dr, dr0, npeel);
1366 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1367 body_cost_vec);
1368 SET_DR_MISALIGNMENT (dr, save_misalignment);
1372 /* Traverse peeling hash table and calculate cost for each peeling option.
1373 Find the one with the lowest cost. */
1376 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1377 _vect_peel_extended_info *min)
1379 vect_peel_info elem = *slot;
1380 int dummy;
1381 unsigned int inside_cost = 0, outside_cost = 0;
1382 gimple *stmt = DR_STMT (elem->dr);
1383 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1384 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1385 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1386 epilogue_cost_vec;
1388 prologue_cost_vec.create (2);
1389 body_cost_vec.create (2);
1390 epilogue_cost_vec.create (2);
1392 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1393 elem->dr, &inside_cost, &outside_cost,
1394 &body_cost_vec, elem->npeel, false);
1396 body_cost_vec.release ();
1398 outside_cost += vect_get_known_peeling_cost
1399 (loop_vinfo, elem->npeel, &dummy,
1400 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1401 &prologue_cost_vec, &epilogue_cost_vec);
1403 /* Prologue and epilogue costs are added to the target model later.
1404 These costs depend only on the scalar iteration cost, the
1405 number of peeling iterations finally chosen, and the number of
1406 misaligned statements. So discard the information found here. */
1407 prologue_cost_vec.release ();
1408 epilogue_cost_vec.release ();
1410 if (inside_cost < min->inside_cost
1411 || (inside_cost == min->inside_cost
1412 && outside_cost < min->outside_cost))
1414 min->inside_cost = inside_cost;
1415 min->outside_cost = outside_cost;
1416 min->peel_info.dr = elem->dr;
1417 min->peel_info.npeel = elem->npeel;
1418 min->peel_info.count = elem->count;
1421 return 1;
1425 /* Choose best peeling option by traversing peeling hash table and either
1426 choosing an option with the lowest cost (if cost model is enabled) or the
1427 option that aligns as many accesses as possible. */
1429 static struct _vect_peel_extended_info
1430 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1431 loop_vec_info loop_vinfo)
1433 struct _vect_peel_extended_info res;
1435 res.peel_info.dr = NULL;
1437 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1439 res.inside_cost = INT_MAX;
1440 res.outside_cost = INT_MAX;
1441 peeling_htab->traverse <_vect_peel_extended_info *,
1442 vect_peeling_hash_get_lowest_cost> (&res);
1444 else
1446 res.peel_info.count = 0;
1447 peeling_htab->traverse <_vect_peel_extended_info *,
1448 vect_peeling_hash_get_most_frequent> (&res);
1449 res.inside_cost = 0;
1450 res.outside_cost = 0;
1453 return res;
1456 /* Return true if the new peeling NPEEL is supported. */
1458 static bool
1459 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1460 unsigned npeel)
1462 unsigned i;
1463 struct data_reference *dr = NULL;
1464 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1465 gimple *stmt;
1466 stmt_vec_info stmt_info;
1467 enum dr_alignment_support supportable_dr_alignment;
1469 /* Ensure that all data refs can be vectorized after the peel. */
1470 FOR_EACH_VEC_ELT (datarefs, i, dr)
1472 int save_misalignment;
1474 if (dr == dr0)
1475 continue;
1477 stmt = DR_STMT (dr);
1478 stmt_info = vinfo_for_stmt (stmt);
1479 /* For interleaving, only the alignment of the first access
1480 matters. */
1481 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1482 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1483 continue;
1485 /* Strided accesses perform only component accesses, alignment is
1486 irrelevant for them. */
1487 if (STMT_VINFO_STRIDED_P (stmt_info)
1488 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1489 continue;
1491 save_misalignment = DR_MISALIGNMENT (dr);
1492 vect_update_misalignment_for_peel (dr, dr0, npeel);
1493 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1494 SET_DR_MISALIGNMENT (dr, save_misalignment);
1496 if (!supportable_dr_alignment)
1497 return false;
1500 return true;
1503 /* Function vect_enhance_data_refs_alignment
1505 This pass will use loop versioning and loop peeling in order to enhance
1506 the alignment of data references in the loop.
1508 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1509 original loop is to be vectorized. Any other loops that are created by
1510 the transformations performed in this pass are not supposed to be
1511 vectorized. This restriction will be relaxed.
1513 This pass will require a cost model to guide it whether to apply peeling
1514 or versioning or a combination of the two. For example, the scheme that
1515 intel uses when given a loop with several memory accesses, is as follows:
1516 choose one memory access ('p') which alignment you want to force by doing
1517 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1518 other accesses are not necessarily aligned, or (2) use loop versioning to
1519 generate one loop in which all accesses are aligned, and another loop in
1520 which only 'p' is necessarily aligned.
1522 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1523 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1524 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1526 Devising a cost model is the most critical aspect of this work. It will
1527 guide us on which access to peel for, whether to use loop versioning, how
1528 many versions to create, etc. The cost model will probably consist of
1529 generic considerations as well as target specific considerations (on
1530 powerpc for example, misaligned stores are more painful than misaligned
1531 loads).
1533 Here are the general steps involved in alignment enhancements:
1535 -- original loop, before alignment analysis:
1536 for (i=0; i<N; i++){
1537 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1538 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1541 -- After vect_compute_data_refs_alignment:
1542 for (i=0; i<N; i++){
1543 x = q[i]; # DR_MISALIGNMENT(q) = 3
1544 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1547 -- Possibility 1: we do loop versioning:
1548 if (p is aligned) {
1549 for (i=0; i<N; i++){ # loop 1A
1550 x = q[i]; # DR_MISALIGNMENT(q) = 3
1551 p[i] = y; # DR_MISALIGNMENT(p) = 0
1554 else {
1555 for (i=0; i<N; i++){ # loop 1B
1556 x = q[i]; # DR_MISALIGNMENT(q) = 3
1557 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1561 -- Possibility 2: we do loop peeling:
1562 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1563 x = q[i];
1564 p[i] = y;
1566 for (i = 3; i < N; i++){ # loop 2A
1567 x = q[i]; # DR_MISALIGNMENT(q) = 0
1568 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1571 -- Possibility 3: combination of loop peeling and versioning:
1572 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1573 x = q[i];
1574 p[i] = y;
1576 if (p is aligned) {
1577 for (i = 3; i<N; i++){ # loop 3A
1578 x = q[i]; # DR_MISALIGNMENT(q) = 0
1579 p[i] = y; # DR_MISALIGNMENT(p) = 0
1582 else {
1583 for (i = 3; i<N; i++){ # loop 3B
1584 x = q[i]; # DR_MISALIGNMENT(q) = 0
1585 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1589 These loops are later passed to loop_transform to be vectorized. The
1590 vectorizer will use the alignment information to guide the transformation
1591 (whether to generate regular loads/stores, or with special handling for
1592 misalignment). */
1594 bool
1595 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1597 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1598 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1599 enum dr_alignment_support supportable_dr_alignment;
1600 struct data_reference *dr0 = NULL, *first_store = NULL;
1601 struct data_reference *dr;
1602 unsigned int i, j;
1603 bool do_peeling = false;
1604 bool do_versioning = false;
1605 bool stat;
1606 gimple *stmt;
1607 stmt_vec_info stmt_info;
1608 unsigned int npeel = 0;
1609 bool one_misalignment_known = false;
1610 bool one_misalignment_unknown = false;
1611 bool one_dr_unsupportable = false;
1612 struct data_reference *unsupportable_dr = NULL;
1613 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1614 unsigned possible_npeel_number = 1;
1615 tree vectype;
1616 unsigned int nelements, mis, same_align_drs_max = 0;
1617 hash_table<peel_info_hasher> peeling_htab (1);
1619 if (dump_enabled_p ())
1620 dump_printf_loc (MSG_NOTE, vect_location,
1621 "=== vect_enhance_data_refs_alignment ===\n");
1623 /* Reset data so we can safely be called multiple times. */
1624 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1625 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1627 /* While cost model enhancements are expected in the future, the high level
1628 view of the code at this time is as follows:
1630 A) If there is a misaligned access then see if peeling to align
1631 this access can make all data references satisfy
1632 vect_supportable_dr_alignment. If so, update data structures
1633 as needed and return true.
1635 B) If peeling wasn't possible and there is a data reference with an
1636 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1637 then see if loop versioning checks can be used to make all data
1638 references satisfy vect_supportable_dr_alignment. If so, update
1639 data structures as needed and return true.
1641 C) If neither peeling nor versioning were successful then return false if
1642 any data reference does not satisfy vect_supportable_dr_alignment.
1644 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1646 Note, Possibility 3 above (which is peeling and versioning together) is not
1647 being done at this time. */
1649 /* (1) Peeling to force alignment. */
1651 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1652 Considerations:
1653 + How many accesses will become aligned due to the peeling
1654 - How many accesses will become unaligned due to the peeling,
1655 and the cost of misaligned accesses.
1656 - The cost of peeling (the extra runtime checks, the increase
1657 in code size). */
1659 FOR_EACH_VEC_ELT (datarefs, i, dr)
1661 stmt = DR_STMT (dr);
1662 stmt_info = vinfo_for_stmt (stmt);
1664 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1665 continue;
1667 /* For interleaving, only the alignment of the first access
1668 matters. */
1669 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1670 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1671 continue;
1673 /* For invariant accesses there is nothing to enhance. */
1674 if (integer_zerop (DR_STEP (dr)))
1675 continue;
1677 /* Strided accesses perform only component accesses, alignment is
1678 irrelevant for them. */
1679 if (STMT_VINFO_STRIDED_P (stmt_info)
1680 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1681 continue;
1683 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1684 do_peeling = vector_alignment_reachable_p (dr);
1685 if (do_peeling)
1687 if (known_alignment_for_access_p (dr))
1689 unsigned int npeel_tmp = 0;
1690 bool negative = tree_int_cst_compare (DR_STEP (dr),
1691 size_zero_node) < 0;
1693 vectype = STMT_VINFO_VECTYPE (stmt_info);
1694 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1695 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1696 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1697 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1698 if (DR_MISALIGNMENT (dr) != 0)
1699 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1701 /* For multiple types, it is possible that the bigger type access
1702 will have more than one peeling option. E.g., a loop with two
1703 types: one of size (vector size / 4), and the other one of
1704 size (vector size / 8). The vectorization factor will be 8. If both
1705 accesses are misaligned by 3, the first one needs one scalar
1706 iteration to be aligned, and the second one needs 5. But the
1707 first one will be aligned also by peeling 5 scalar
1708 iterations, and in that case both accesses will be aligned.
1709 Hence, except for the immediate peeling amount, we also want
1710 to try to add full vector size, while we don't exceed
1711 vectorization factor.
1712 We do this automatically for cost model, since we calculate
1713 cost for every peeling option. */
1714 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1716 if (STMT_SLP_TYPE (stmt_info))
1717 possible_npeel_number
1718 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1719 else
1720 possible_npeel_number = vf / nelements;
1722 /* NPEEL_TMP is 0 when there is no misalignment, but also
1723 allow peeling NELEMENTS. */
1724 if (DR_MISALIGNMENT (dr) == 0)
1725 possible_npeel_number++;
1728 /* Save info about DR in the hash table. Also include peeling
1729 amounts according to the explanation above. */
1730 for (j = 0; j < possible_npeel_number; j++)
1732 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1733 dr, npeel_tmp);
1734 npeel_tmp += target_align / dr_size;
1737 one_misalignment_known = true;
1739 else
1741 /* If we don't know any misalignment values, we prefer
1742 peeling for the data-ref that has the maximum number of data-refs
1743 with the same alignment, unless the target prefers to align
1744 stores over loads. */
1745 unsigned same_align_drs
1746 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1747 if (!dr0
1748 || same_align_drs_max < same_align_drs)
1750 same_align_drs_max = same_align_drs;
1751 dr0 = dr;
1753 /* For data-refs with the same number of related
1754 accesses prefer the one where the misalign
1755 computation will be invariant in the outermost loop. */
1756 else if (same_align_drs_max == same_align_drs)
1758 struct loop *ivloop0, *ivloop;
1759 ivloop0 = outermost_invariant_loop_for_expr
1760 (loop, DR_BASE_ADDRESS (dr0));
1761 ivloop = outermost_invariant_loop_for_expr
1762 (loop, DR_BASE_ADDRESS (dr));
1763 if ((ivloop && !ivloop0)
1764 || (ivloop && ivloop0
1765 && flow_loop_nested_p (ivloop, ivloop0)))
1766 dr0 = dr;
1769 one_misalignment_unknown = true;
1771 /* Check for data refs with unsupportable alignment that
1772 can be peeled. */
1773 if (!supportable_dr_alignment)
1775 one_dr_unsupportable = true;
1776 unsupportable_dr = dr;
1779 if (!first_store && DR_IS_WRITE (dr))
1780 first_store = dr;
1783 else
1785 if (!aligned_access_p (dr))
1787 if (dump_enabled_p ())
1788 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1789 "vector alignment may not be reachable\n");
1790 break;
1795 /* Check if we can possibly peel the loop. */
1796 if (!vect_can_advance_ivs_p (loop_vinfo)
1797 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1798 || loop->inner)
1799 do_peeling = false;
1801 struct _vect_peel_extended_info peel_for_known_alignment;
1802 struct _vect_peel_extended_info peel_for_unknown_alignment;
1803 struct _vect_peel_extended_info best_peel;
1805 peel_for_unknown_alignment.inside_cost = INT_MAX;
1806 peel_for_unknown_alignment.outside_cost = INT_MAX;
1807 peel_for_unknown_alignment.peel_info.count = 0;
1809 if (do_peeling
1810 && one_misalignment_unknown)
1812 /* Check whether the target prefers aligning stores over loads, i.e.,
1813 whether misaligned stores are more expensive than misaligned loads
1814 (taking drs with the same alignment into account). */
1815 unsigned int load_inside_cost = 0;
1816 unsigned int load_outside_cost = 0;
1817 unsigned int store_inside_cost = 0;
1818 unsigned int store_outside_cost = 0;
1820 stmt_vector_for_cost dummy;
1821 dummy.create (2);
1822 vect_get_peeling_costs_all_drs (datarefs, dr0,
1823 &load_inside_cost,
1824 &load_outside_cost,
1825 &dummy, vf / 2, true);
1826 dummy.release ();
1828 if (first_store)
1830 dummy.create (2);
1831 vect_get_peeling_costs_all_drs (datarefs, first_store,
1832 &store_inside_cost,
1833 &store_outside_cost,
1834 &dummy, vf / 2, true);
1835 dummy.release ();
1837 else
1839 store_inside_cost = INT_MAX;
1840 store_outside_cost = INT_MAX;
1843 if (load_inside_cost > store_inside_cost
1844 || (load_inside_cost == store_inside_cost
1845 && load_outside_cost > store_outside_cost))
1847 dr0 = first_store;
1848 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1849 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1851 else
1853 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1854 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1857 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1858 prologue_cost_vec.create (2);
1859 epilogue_cost_vec.create (2);
1861 int dummy2;
1862 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1863 (loop_vinfo, vf / 2, &dummy2,
1864 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1865 &prologue_cost_vec, &epilogue_cost_vec);
1867 prologue_cost_vec.release ();
1868 epilogue_cost_vec.release ();
1870 peel_for_unknown_alignment.peel_info.count = 1
1871 + STMT_VINFO_SAME_ALIGN_REFS
1872 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1875 peel_for_unknown_alignment.peel_info.npeel = 0;
1876 peel_for_unknown_alignment.peel_info.dr = dr0;
1878 best_peel = peel_for_unknown_alignment;
1880 peel_for_known_alignment.inside_cost = INT_MAX;
1881 peel_for_known_alignment.outside_cost = INT_MAX;
1882 peel_for_known_alignment.peel_info.count = 0;
1883 peel_for_known_alignment.peel_info.dr = NULL;
1885 if (do_peeling && one_misalignment_known)
1887 /* Peeling is possible, but no data access strictly requires alignment
1888 in order to be supported. So we try to choose the best possible
1889 peeling from the hash table. */
1890 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1891 (&peeling_htab, loop_vinfo);
1894 /* Compare costs of peeling for known and unknown alignment. */
1895 if (peel_for_known_alignment.peel_info.dr != NULL
1896 && peel_for_unknown_alignment.inside_cost
1897 >= peel_for_known_alignment.inside_cost)
1899 best_peel = peel_for_known_alignment;
1901 /* If the best peeling for known alignment has NPEEL == 0, perform no
1902 peeling at all except if there is an unsupportable dr that we can
1903 align. */
1904 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1905 do_peeling = false;
1908 /* If there is an unsupportable data ref, prefer this over all choices so far
1909 since we'd have to discard a chosen peeling except when it accidentally
1910 aligned the unsupportable data ref. */
1911 if (one_dr_unsupportable)
1912 dr0 = unsupportable_dr;
1913 else if (do_peeling)
1915 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1916 TODO: Use nopeel_outside_cost or get rid of it? */
1917 unsigned nopeel_inside_cost = 0;
1918 unsigned nopeel_outside_cost = 0;
1920 stmt_vector_for_cost dummy;
1921 dummy.create (2);
1922 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1923 &nopeel_outside_cost, &dummy, 0, false);
1924 dummy.release ();
1926 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1927 costs will be recorded. */
1928 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1929 prologue_cost_vec.create (2);
1930 epilogue_cost_vec.create (2);
1932 int dummy2;
1933 nopeel_outside_cost += vect_get_known_peeling_cost
1934 (loop_vinfo, 0, &dummy2,
1935 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1936 &prologue_cost_vec, &epilogue_cost_vec);
1938 prologue_cost_vec.release ();
1939 epilogue_cost_vec.release ();
1941 npeel = best_peel.peel_info.npeel;
1942 dr0 = best_peel.peel_info.dr;
1944 /* If not peeling at all is no more expensive than the best peeling we
1945 have found so far, don't perform any peeling. */
1946 if (nopeel_inside_cost <= best_peel.inside_cost)
1947 do_peeling = false;
1950 if (do_peeling)
1952 stmt = DR_STMT (dr0);
1953 stmt_info = vinfo_for_stmt (stmt);
1954 vectype = STMT_VINFO_VECTYPE (stmt_info);
1956 if (known_alignment_for_access_p (dr0))
1958 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1959 size_zero_node) < 0;
1960 if (!npeel)
1962 /* Since it's known at compile time, compute the number of
1963 iterations in the peeled loop (the peeling factor) for use in
1964 updating DR_MISALIGNMENT values. The peeling factor is the
1965 vectorization factor minus the misalignment as an element
1966 count. */
1967 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
1968 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
1969 npeel = ((mis & (target_align - 1))
1970 / vect_get_scalar_dr_size (dr0));
1973 /* For interleaved data access every iteration accesses all the
1974 members of the group, therefore we divide the number of iterations
1975 by the group size. */
1976 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1977 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1978 npeel /= GROUP_SIZE (stmt_info);
1980 if (dump_enabled_p ())
1981 dump_printf_loc (MSG_NOTE, vect_location,
1982 "Try peeling by %d\n", npeel);
1985 /* Ensure that all datarefs can be vectorized after the peel. */
1986 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1987 do_peeling = false;
1989 /* Check if all datarefs are supportable and log. */
1990 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1992 stat = vect_verify_datarefs_alignment (loop_vinfo);
1993 if (!stat)
1994 do_peeling = false;
1995 else
1996 return stat;
1999 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2000 if (do_peeling)
2002 unsigned max_allowed_peel
2003 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2004 if (max_allowed_peel != (unsigned)-1)
2006 unsigned max_peel = npeel;
2007 if (max_peel == 0)
2009 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2010 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2012 if (max_peel > max_allowed_peel)
2014 do_peeling = false;
2015 if (dump_enabled_p ())
2016 dump_printf_loc (MSG_NOTE, vect_location,
2017 "Disable peeling, max peels reached: %d\n", max_peel);
2022 /* Cost model #2 - if peeling may result in a remaining loop not
2023 iterating enough to be vectorized then do not peel. */
2024 if (do_peeling
2025 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2027 unsigned max_peel
2028 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
2029 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
2030 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
2031 do_peeling = false;
2034 if (do_peeling)
2036 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2037 If the misalignment of DR_i is identical to that of dr0 then set
2038 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2039 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2040 by the peeling factor times the element size of DR_i (MOD the
2041 vectorization factor times the size). Otherwise, the
2042 misalignment of DR_i must be set to unknown. */
2043 FOR_EACH_VEC_ELT (datarefs, i, dr)
2044 if (dr != dr0)
2046 /* Strided accesses perform only component accesses, alignment
2047 is irrelevant for them. */
2048 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2049 if (STMT_VINFO_STRIDED_P (stmt_info)
2050 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2051 continue;
2053 vect_update_misalignment_for_peel (dr, dr0, npeel);
2056 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2057 if (npeel)
2058 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2059 else
2060 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2061 = DR_MISALIGNMENT (dr0);
2062 SET_DR_MISALIGNMENT (dr0, 0);
2063 if (dump_enabled_p ())
2065 dump_printf_loc (MSG_NOTE, vect_location,
2066 "Alignment of access forced using peeling.\n");
2067 dump_printf_loc (MSG_NOTE, vect_location,
2068 "Peeling for alignment will be applied.\n");
2071 /* The inside-loop cost will be accounted for in vectorizable_load
2072 and vectorizable_store correctly with adjusted alignments.
2073 Drop the body_cst_vec on the floor here. */
2074 stat = vect_verify_datarefs_alignment (loop_vinfo);
2075 gcc_assert (stat);
2076 return stat;
2080 /* (2) Versioning to force alignment. */
2082 /* Try versioning if:
2083 1) the loop is optimized for speed,
2084 2) there is at least one unsupported misaligned data ref with an unknown
2085 misalignment, and
2086 3) all misaligned data refs with a known misalignment are supported, and
2087 4) the number of runtime alignment checks is within reason. */
2089 do_versioning =
2090 optimize_loop_nest_for_speed_p (loop)
2091 && (!loop->inner); /* FORNOW */
2093 if (do_versioning)
2095 FOR_EACH_VEC_ELT (datarefs, i, dr)
2097 stmt = DR_STMT (dr);
2098 stmt_info = vinfo_for_stmt (stmt);
2100 /* For interleaving, only the alignment of the first access
2101 matters. */
2102 if (aligned_access_p (dr)
2103 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2104 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2105 continue;
2107 if (STMT_VINFO_STRIDED_P (stmt_info))
2109 /* Strided loads perform only component accesses, alignment is
2110 irrelevant for them. */
2111 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2112 continue;
2113 do_versioning = false;
2114 break;
2117 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2119 if (!supportable_dr_alignment)
2121 gimple *stmt;
2122 int mask;
2123 tree vectype;
2125 if (known_alignment_for_access_p (dr)
2126 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2127 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2129 do_versioning = false;
2130 break;
2133 stmt = DR_STMT (dr);
2134 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2135 gcc_assert (vectype);
2137 /* The rightmost bits of an aligned address must be zeros.
2138 Construct the mask needed for this test. For example,
2139 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2140 mask must be 15 = 0xf. */
2141 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
2143 /* FORNOW: use the same mask to test all potentially unaligned
2144 references in the loop. The vectorizer currently supports
2145 a single vector size, see the reference to
2146 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2147 vectorization factor is computed. */
2148 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2149 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2150 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2151 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2152 DR_STMT (dr));
2156 /* Versioning requires at least one misaligned data reference. */
2157 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2158 do_versioning = false;
2159 else if (!do_versioning)
2160 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2163 if (do_versioning)
2165 vec<gimple *> may_misalign_stmts
2166 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2167 gimple *stmt;
2169 /* It can now be assumed that the data references in the statements
2170 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2171 of the loop being vectorized. */
2172 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2174 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2175 dr = STMT_VINFO_DATA_REF (stmt_info);
2176 SET_DR_MISALIGNMENT (dr, 0);
2177 if (dump_enabled_p ())
2178 dump_printf_loc (MSG_NOTE, vect_location,
2179 "Alignment of access forced using versioning.\n");
2182 if (dump_enabled_p ())
2183 dump_printf_loc (MSG_NOTE, vect_location,
2184 "Versioning for alignment will be applied.\n");
2186 /* Peeling and versioning can't be done together at this time. */
2187 gcc_assert (! (do_peeling && do_versioning));
2189 stat = vect_verify_datarefs_alignment (loop_vinfo);
2190 gcc_assert (stat);
2191 return stat;
2194 /* This point is reached if neither peeling nor versioning is being done. */
2195 gcc_assert (! (do_peeling || do_versioning));
2197 stat = vect_verify_datarefs_alignment (loop_vinfo);
2198 return stat;
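/* Illustrative sketch only, not used by the vectorizer: the helper below
   works through the peeling-candidate arithmetic used above with plain
   integers.  The name and parameters are hypothetical; it assumes a
   positive step and a power-of-two TARGET_ALIGN_BYTES, as
   DR_TARGET_ALIGNMENT always is.  With a 32-byte target alignment,
   4-byte elements and a misalignment of 12 bytes the candidates are
   5, 13, 21, ... scalar iterations, mirroring how the loop above steps
   NPEEL_TMP by target_align / dr_size for each hash table entry.  */

static unsigned int
example_peel_candidate (unsigned int misalign_bytes,
			unsigned int target_align_bytes,
			unsigned int elem_size_bytes,
			unsigned int which)
{
  /* Scalar iterations needed to reach the next aligned boundary.  */
  unsigned int first = ((0U - misalign_bytes) & (target_align_bytes - 1))
		       / elem_size_bytes;
  /* Each further candidate peels one more full vector's worth of
     elements, so accesses of different element sizes can still end up
     aligned by the same peel count.  */
  return first + which * (target_align_bytes / elem_size_bytes);
}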
2202 /* Function vect_find_same_alignment_drs.
2204 Update group and alignment relations according to the chosen
2205 vectorization factor. */
2207 static void
2208 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2210 struct data_reference *dra = DDR_A (ddr);
2211 struct data_reference *drb = DDR_B (ddr);
2212 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2213 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2215 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2216 return;
2218 if (dra == drb)
2219 return;
2221 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2222 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2223 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2224 return;
2226 /* Two references with distance zero have the same alignment. */
2227 offset_int diff = (wi::to_offset (DR_INIT (dra))
2228 - wi::to_offset (DR_INIT (drb)));
2229 if (diff != 0)
2231 /* Get the wider of the two alignments. */
2232 unsigned int align_a = (vect_calculate_target_alignment (dra)
2233 / BITS_PER_UNIT);
2234 unsigned int align_b = (vect_calculate_target_alignment (drb)
2235 / BITS_PER_UNIT);
2236 unsigned int max_align = MAX (align_a, align_b);
2238 /* Require the gap to be a multiple of the larger vector alignment. */
2239 if (!wi::multiple_of_p (diff, max_align, SIGNED))
2240 return;
2243 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2244 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2245 if (dump_enabled_p ())
2247 dump_printf_loc (MSG_NOTE, vect_location,
2248 "accesses have the same alignment: ");
2249 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2250 dump_printf (MSG_NOTE, " and ");
2251 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2252 dump_printf (MSG_NOTE, "\n");
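/* A minimal sketch of the distance test above, with plain integers
   standing in for the DR_INIT values and the byte target alignments.
   The helper name is hypothetical and the function is not used by the
   vectorizer: two references whose base, offset and step compare equal
   get the same misalignment whenever their constant-offset distance is
   a multiple of the wider of the two target alignments.  */

static bool
example_same_alignment_p (long long init_a, long long init_b,
			  unsigned int target_align_a,
			  unsigned int target_align_b)
{
  long long diff = init_a - init_b;
  /* Distance zero trivially gives the same alignment.  */
  if (diff == 0)
    return true;
  /* Otherwise require the gap to be a multiple of the larger
     alignment, just like the wi::multiple_of_p check above.  */
  unsigned int max_align = (target_align_a > target_align_b
			    ? target_align_a : target_align_b);
  return diff % (long long) max_align == 0;
}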
2257 /* Function vect_analyze_data_refs_alignment
2259 Analyze the alignment of the data-references in the loop.
2260 Return FALSE if a data reference is found that cannot be vectorized. */
2262 bool
2263 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2265 if (dump_enabled_p ())
2266 dump_printf_loc (MSG_NOTE, vect_location,
2267 "=== vect_analyze_data_refs_alignment ===\n");
2269 /* Mark groups of data references with same alignment using
2270 data dependence information. */
2271 vec<ddr_p> ddrs = vinfo->ddrs;
2272 struct data_dependence_relation *ddr;
2273 unsigned int i;
2275 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2276 vect_find_same_alignment_drs (ddr);
2278 vec<data_reference_p> datarefs = vinfo->datarefs;
2279 struct data_reference *dr;
2281 vect_record_base_alignments (vinfo);
2282 FOR_EACH_VEC_ELT (datarefs, i, dr)
2284 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2285 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2286 && !vect_compute_data_ref_alignment (dr))
2288 /* Strided accesses perform only component accesses, misalignment
2289 information is irrelevant for them. */
2290 if (STMT_VINFO_STRIDED_P (stmt_info)
2291 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2292 continue;
2294 if (dump_enabled_p ())
2295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2296 "not vectorized: can't calculate alignment "
2297 "for data ref.\n");
2299 return false;
2303 return true;
2307 /* Analyze alignment of DRs of stmts in NODE. */
2309 static bool
2310 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2312 /* We vectorize from the first scalar stmt in the node unless
2313 the node is permuted in which case we start from the first
2314 element in the group. */
2315 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2316 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2317 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2318 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2320 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2321 if (! vect_compute_data_ref_alignment (dr)
2322 /* For creating the data-ref pointer we need alignment of the
2323 first element anyway. */
2324 || (dr != first_dr
2325 && ! vect_compute_data_ref_alignment (first_dr))
2326 || ! verify_data_ref_alignment (dr))
2328 if (dump_enabled_p ())
2329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2330 "not vectorized: bad data alignment in basic "
2331 "block.\n");
2332 return false;
2335 return true;
2338 /* Function vect_slp_analyze_and_verify_instance_alignment
2340 Analyze the alignment of the data-references in the SLP instance.
2341 Return FALSE if a data reference is found that cannot be vectorized. */
2343 bool
2344 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2346 if (dump_enabled_p ())
2347 dump_printf_loc (MSG_NOTE, vect_location,
2348 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2350 slp_tree node;
2351 unsigned i;
2352 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2353 if (! vect_slp_analyze_and_verify_node_alignment (node))
2354 return false;
2356 node = SLP_INSTANCE_TREE (instance);
2357 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2358 && ! vect_slp_analyze_and_verify_node_alignment
2359 (SLP_INSTANCE_TREE (instance)))
2360 return false;
2362 return true;
2366 /* Analyze groups of accesses: check that DR belongs to a group of
2367 accesses of legal size, step, etc. Detect gaps, single element
2368 interleaving, and other special cases. Set grouped access info.
2369 Collect groups of strided stores for further use in SLP analysis.
2370 Worker for vect_analyze_group_access. */
2372 static bool
2373 vect_analyze_group_access_1 (struct data_reference *dr)
2375 tree step = DR_STEP (dr);
2376 tree scalar_type = TREE_TYPE (DR_REF (dr));
2377 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2378 gimple *stmt = DR_STMT (dr);
2379 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2380 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2381 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2382 HOST_WIDE_INT dr_step = -1;
2383 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2384 bool slp_impossible = false;
2386 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2387 size of the interleaving group (including gaps). */
2388 if (tree_fits_shwi_p (step))
2390 dr_step = tree_to_shwi (step);
2391 /* Check that STEP is a multiple of type size. Otherwise there is
2392 a non-element-sized gap at the end of the group which we
2393 cannot represent in GROUP_GAP or GROUP_SIZE.
2394 ??? As we can handle non-constant step fine here we should
2395 simply remove uses of GROUP_GAP between the last and first
2396 element and instead rely on DR_STEP. GROUP_SIZE then would
2397 simply not include that gap. */
2398 if ((dr_step % type_size) != 0)
2400 if (dump_enabled_p ())
2402 dump_printf_loc (MSG_NOTE, vect_location,
2403 "Step ");
2404 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2405 dump_printf (MSG_NOTE,
2406 " is not a multiple of the element size for ");
2407 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2408 dump_printf (MSG_NOTE, "\n");
2410 return false;
2412 groupsize = absu_hwi (dr_step) / type_size;
2414 else
2415 groupsize = 0;
2417 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2418 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2420 /* Check if this DR is part of an interleaving group, and is a single
2421 element of the group that is accessed in the loop. */
2423 /* Gaps are supported only for loads. STEP must be a multiple of the type
2424 size. The size of the group must be a power of 2. */
2425 if (DR_IS_READ (dr)
2426 && (dr_step % type_size) == 0
2427 && groupsize > 0
2428 && pow2p_hwi (groupsize))
2430 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2431 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2432 GROUP_GAP (stmt_info) = groupsize - 1;
2433 if (dump_enabled_p ())
2435 dump_printf_loc (MSG_NOTE, vect_location,
2436 "Detected single element interleaving ");
2437 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2438 dump_printf (MSG_NOTE, " step ");
2439 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2440 dump_printf (MSG_NOTE, "\n");
2443 return true;
2446 if (dump_enabled_p ())
2448 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2449 "not consecutive access ");
2450 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2453 if (bb_vinfo)
2455 /* Mark the statement as unvectorizable. */
2456 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2457 return true;
2460 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2461 STMT_VINFO_STRIDED_P (stmt_info) = true;
2462 return true;
2465 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2467 /* First stmt in the interleaving chain. Check the chain. */
2468 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2469 struct data_reference *data_ref = dr;
2470 unsigned int count = 1;
2471 tree prev_init = DR_INIT (data_ref);
2472 gimple *prev = stmt;
2473 HOST_WIDE_INT diff, gaps = 0;
2475 while (next)
2477 /* Skip same data-refs. In case two or more stmts share a
2478 data-ref (supported only for loads), we vectorize only the first
2479 stmt, and the rest get their vectorized loads from the first
2480 one. */
2481 if (!tree_int_cst_compare (DR_INIT (data_ref),
2482 DR_INIT (STMT_VINFO_DATA_REF (
2483 vinfo_for_stmt (next)))))
2485 if (DR_IS_WRITE (data_ref))
2487 if (dump_enabled_p ())
2488 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2489 "Two store stmts share the same dr.\n");
2490 return false;
2493 if (dump_enabled_p ())
2494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2495 "Two or more load stmts share the same dr.\n");
2497 /* For load use the same data-ref load. */
2498 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2500 prev = next;
2501 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2502 continue;
2505 prev = next;
2506 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2508 /* All group members have the same STEP by construction. */
2509 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2511 /* Check that the distance between two accesses is equal to the type
2512 size. Otherwise, we have gaps. */
2513 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2514 - TREE_INT_CST_LOW (prev_init)) / type_size;
2515 if (diff != 1)
2517 /* FORNOW: SLP of accesses with gaps is not supported. */
2518 slp_impossible = true;
2519 if (DR_IS_WRITE (data_ref))
2521 if (dump_enabled_p ())
2522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2523 "interleaved store with gaps\n");
2524 return false;
2527 gaps += diff - 1;
2530 last_accessed_element += diff;
2532 /* Store the gap from the previous member of the group. If there is no
2533 gap in the access, GROUP_GAP is always 1. */
2534 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2536 prev_init = DR_INIT (data_ref);
2537 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2538 /* Count the number of data-refs in the chain. */
2539 count++;
2542 if (groupsize == 0)
2543 groupsize = count + gaps;
2545 /* This could be UINT_MAX but as we are generating code in a very
2546 inefficient way we have to cap earlier. See PR78699 for example. */
2547 if (groupsize > 4096)
2549 if (dump_enabled_p ())
2550 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2551 "group is too large\n");
2552 return false;
2555 /* Check that the size of the interleaving is equal to count for stores,
2556 i.e., that there are no gaps. */
2557 if (groupsize != count
2558 && !DR_IS_READ (dr))
2560 if (dump_enabled_p ())
2561 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2562 "interleaved store with gaps\n");
2563 return false;
2566 /* If there is a gap after the last load in the group it is the
2567 difference between the groupsize and the last accessed
2568 element.
2569 When there is no gap, this difference should be 0. */
2570 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2572 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2573 if (dump_enabled_p ())
2575 dump_printf_loc (MSG_NOTE, vect_location,
2576 "Detected interleaving ");
2577 if (DR_IS_READ (dr))
2578 dump_printf (MSG_NOTE, "load ");
2579 else
2580 dump_printf (MSG_NOTE, "store ");
2581 dump_printf (MSG_NOTE, "of size %u starting with ",
2582 (unsigned)groupsize);
2583 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2584 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2585 dump_printf_loc (MSG_NOTE, vect_location,
2586 "There is a gap of %u elements after the group\n",
2587 GROUP_GAP (vinfo_for_stmt (stmt)));
2590 /* SLP: create an SLP data structure for every interleaving group of
2591 stores for further analysis in vect_analyze_slp. */
2592 if (DR_IS_WRITE (dr) && !slp_impossible)
2594 if (loop_vinfo)
2595 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2596 if (bb_vinfo)
2597 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2601 return true;
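/* Illustrative sketch, not used by the vectorizer, of the bookkeeping
   performed above for a group with a known constant step.  OFFSETS
   holds the byte offsets (DR_INITs) of the N members in increasing
   order with no duplicates; GAPS[k] for k > 0 receives the element
   distance of member k from member k - 1 (1 when there is no gap) and
   GAPS[0] the gap between the last access and the end of the group.
   The helper name and the array-based interface are hypothetical; the
   real code walks the GROUP_NEXT_ELEMENT chain instead.  */

static long long
example_group_layout (const long long *offsets, unsigned int n,
		      long long step_bytes, long long type_size,
		      long long *gaps)
{
  /* GROUPSIZE is the step counted in elements, including gaps.  */
  long long groupsize = (step_bytes < 0 ? -step_bytes : step_bytes)
			/ type_size;
  long long last_accessed = 1;
  for (unsigned int k = 1; k < n; k++)
    {
      long long diff = (offsets[k] - offsets[k - 1]) / type_size;
      gaps[k] = diff;
      last_accessed += diff;
    }
  gaps[0] = groupsize - last_accessed;
  return groupsize;
}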
2604 /* Analyze groups of accesses: check that DR belongs to a group of
2605 accesses of legal size, step, etc. Detect gaps, single element
2606 interleaving, and other special cases. Set grouped access info.
2607 Collect groups of strided stores for further use in SLP analysis. */
2609 static bool
2610 vect_analyze_group_access (struct data_reference *dr)
2612 if (!vect_analyze_group_access_1 (dr))
2614 /* Dissolve the group if present. */
2615 gimple *next;
2616 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2617 while (stmt)
2619 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2620 next = GROUP_NEXT_ELEMENT (vinfo);
2621 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2622 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2623 stmt = next;
2625 return false;
2627 return true;
2630 /* Analyze the access pattern of the data-reference DR.
2631 In case of non-consecutive accesses call vect_analyze_group_access() to
2632 analyze groups of accesses. */
2634 static bool
2635 vect_analyze_data_ref_access (struct data_reference *dr)
2637 tree step = DR_STEP (dr);
2638 tree scalar_type = TREE_TYPE (DR_REF (dr));
2639 gimple *stmt = DR_STMT (dr);
2640 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2641 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2642 struct loop *loop = NULL;
2644 if (loop_vinfo)
2645 loop = LOOP_VINFO_LOOP (loop_vinfo);
2647 if (loop_vinfo && !step)
2649 if (dump_enabled_p ())
2650 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2651 "bad data-ref access in loop\n");
2652 return false;
2655 /* Allow loads with zero step in inner-loop vectorization. */
2656 if (loop_vinfo && integer_zerop (step))
2658 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2659 if (!nested_in_vect_loop_p (loop, stmt))
2660 return DR_IS_READ (dr);
2661 /* Allow references with zero step for outer loops marked
2662 with pragma omp simd only - it guarantees absence of
2663 loop-carried dependencies between inner loop iterations. */
2664 if (!loop->force_vectorize)
2666 if (dump_enabled_p ())
2667 dump_printf_loc (MSG_NOTE, vect_location,
2668 "zero step in inner loop of nest\n");
2669 return false;
2673 if (loop && nested_in_vect_loop_p (loop, stmt))
2675 /* Interleaved accesses are not yet supported within outer-loop
2676 vectorization for references in the inner-loop. */
2677 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2679 /* For the rest of the analysis we use the outer-loop step. */
2680 step = STMT_VINFO_DR_STEP (stmt_info);
2681 if (integer_zerop (step))
2683 if (dump_enabled_p ())
2684 dump_printf_loc (MSG_NOTE, vect_location,
2685 "zero step in outer loop.\n");
2686 return DR_IS_READ (dr);
2690 /* Consecutive? */
2691 if (TREE_CODE (step) == INTEGER_CST)
2693 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2694 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2695 || (dr_step < 0
2696 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2698 /* Mark that it is not interleaving. */
2699 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2700 return true;
2704 if (loop && nested_in_vect_loop_p (loop, stmt))
2706 if (dump_enabled_p ())
2707 dump_printf_loc (MSG_NOTE, vect_location,
2708 "grouped access in outer loop.\n");
2709 return false;
2713 /* Assume this is a DR handled by the non-constant strided load case. */
2714 if (TREE_CODE (step) != INTEGER_CST)
2715 return (STMT_VINFO_STRIDED_P (stmt_info)
2716 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2717 || vect_analyze_group_access (dr)));
2719 /* Not a consecutive access - check if it's part of an interleaving group. */
2720 return vect_analyze_group_access (dr);
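/* A condensed sketch of the classification above for the common case of
   a known constant step (the hypothetical helper below ignores outer
   loops, zero-step loads and non-constant steps, which the real code
   handles separately): an access is consecutive exactly when its step
   equals the element size or its negation; anything else is a candidate
   for interleaving or strided handling.  */

static bool
example_consecutive_access_p (long long step_bytes, long long type_size)
{
  return step_bytes == type_size || step_bytes == -type_size;
}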
2723 /* Compare two data-references DRA and DRB so that sorting puts them
2724 into chunks suitable for forming interleaving groups. */
2726 static int
2727 dr_group_sort_cmp (const void *dra_, const void *drb_)
2729 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2730 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2731 int cmp;
2733 /* Stabilize sort. */
2734 if (dra == drb)
2735 return 0;
2737 /* DRs in different loops never belong to the same group. */
2738 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2739 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2740 if (loopa != loopb)
2741 return loopa->num < loopb->num ? -1 : 1;
2743 /* Ordering of DRs according to base. */
2744 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2745 DR_BASE_ADDRESS (drb));
2746 if (cmp != 0)
2747 return cmp;
2749 /* And according to DR_OFFSET. */
2750 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2751 if (cmp != 0)
2752 return cmp;
2754 /* Put reads before writes. */
2755 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2756 return DR_IS_READ (dra) ? -1 : 1;
2758 /* Then sort after access size. */
2759 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2760 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2761 if (cmp != 0)
2762 return cmp;
2764 /* And after step. */
2765 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2766 if (cmp != 0)
2767 return cmp;
2769 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2770 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2771 if (cmp == 0)
2772 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2773 return cmp;
2776 /* Function vect_analyze_data_ref_accesses.
2778 Analyze the access pattern of all the data references in the loop.
2780 FORNOW: the only access pattern that is considered vectorizable is a
2781 simple step 1 (consecutive) access.
2783 FORNOW: handle only arrays and pointer accesses. */
2785 bool
2786 vect_analyze_data_ref_accesses (vec_info *vinfo)
2788 unsigned int i;
2789 vec<data_reference_p> datarefs = vinfo->datarefs;
2790 struct data_reference *dr;
2792 if (dump_enabled_p ())
2793 dump_printf_loc (MSG_NOTE, vect_location,
2794 "=== vect_analyze_data_ref_accesses ===\n");
2796 if (datarefs.is_empty ())
2797 return true;
2799 /* Sort the array of datarefs to make building the interleaving chains
2800 linear. Don't modify the original vector's order; it is needed for
2801 determining what dependencies are reversed. */
2802 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2803 datarefs_copy.qsort (dr_group_sort_cmp);
2805 /* Build the interleaving chains. */
2806 for (i = 0; i < datarefs_copy.length () - 1;)
2808 data_reference_p dra = datarefs_copy[i];
2809 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2810 stmt_vec_info lastinfo = NULL;
2811 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2813 ++i;
2814 continue;
2816 for (i = i + 1; i < datarefs_copy.length (); ++i)
2818 data_reference_p drb = datarefs_copy[i];
2819 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2820 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2821 break;
2823 /* ??? Imperfect sorting (non-compatible types, non-modulo
2824 accesses, same accesses) can lead to a group being artificially
2825 split here, as we don't just skip over those. If it really
2826 matters we can push those to a worklist and re-iterate
2827 over them. Then we can just skip ahead to the next DR here. */
2829 /* DRs in a different loop should not be put into the same
2830 interleaving group. */
2831 if (gimple_bb (DR_STMT (dra))->loop_father
2832 != gimple_bb (DR_STMT (drb))->loop_father)
2833 break;
2835 /* Check that the data-refs have same first location (except init)
2836 and they are both either store or load (not load and store,
2837 not masked loads or stores). */
2838 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2839 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2840 DR_BASE_ADDRESS (drb)) != 0
2841 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2842 || !gimple_assign_single_p (DR_STMT (dra))
2843 || !gimple_assign_single_p (DR_STMT (drb)))
2844 break;
2846 /* Check that the data-refs have the same constant size. */
2847 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2848 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2849 if (!tree_fits_uhwi_p (sza)
2850 || !tree_fits_uhwi_p (szb)
2851 || !tree_int_cst_equal (sza, szb))
2852 break;
2854 /* Check that the data-refs have the same step. */
2855 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2856 break;
2858 /* Check the types are compatible.
2859 ??? We don't distinguish this during sorting. */
2860 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2861 TREE_TYPE (DR_REF (drb))))
2862 break;
2864 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2865 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2866 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2867 HOST_WIDE_INT init_prev
2868 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2869 gcc_assert (init_a <= init_b
2870 && init_a <= init_prev
2871 && init_prev <= init_b);
2873 /* Do not place the same access in the interleaving chain twice. */
2874 if (init_b == init_prev)
2876 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2877 < gimple_uid (DR_STMT (drb)));
2878 /* ??? For now we simply "drop" the later reference which is
2879 otherwise the same rather than finishing off this group.
2880 In the end we'd want to re-process duplicates forming
2881 multiple groups from the refs, likely by just collecting
2882 all candidates (including duplicates and split points
2883 below) in a vector and then processing them together. */
2884 continue;
2887 /* If init_b == init_a + the size of the type * k, we have an
2888 interleaving, and DRA is accessed before DRB. */
2889 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2890 if (type_size_a == 0
2891 || (init_b - init_a) % type_size_a != 0)
2892 break;
2894 /* If we have a store, the accesses are adjacent. This splits
2895 groups into chunks we support (we don't support vectorization
2896 of stores with gaps). */
2897 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
2898 break;
2900 /* If the constant, nonzero step is not larger than the difference
2901 between the data-refs' inits, DRB already falls into the next
2902 iteration of DRA; split groups here into suitable sizes. */
2903 if (tree_fits_shwi_p (DR_STEP (dra)))
2905 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2906 if (step != 0 && step <= (init_b - init_a))
2907 break;
2910 if (dump_enabled_p ())
2912 dump_printf_loc (MSG_NOTE, vect_location,
2913 "Detected interleaving ");
2914 if (DR_IS_READ (dra))
2915 dump_printf (MSG_NOTE, "load ");
2916 else
2917 dump_printf (MSG_NOTE, "store ");
2918 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2919 dump_printf (MSG_NOTE, " and ");
2920 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2921 dump_printf (MSG_NOTE, "\n");
2924 /* Link the found element into the group list. */
2925 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2927 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2928 lastinfo = stmtinfo_a;
2930 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2931 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2932 lastinfo = stmtinfo_b;
2936 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2937 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2938 && !vect_analyze_data_ref_access (dr))
2940 if (dump_enabled_p ())
2941 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2942 "not vectorized: complicated access pattern.\n");
2944 if (is_a <bb_vec_info> (vinfo))
2946 /* Mark the statement as not vectorizable. */
2947 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2948 continue;
2950 else
2952 datarefs_copy.release ();
2953 return false;
2957 datarefs_copy.release ();
2958 return true;
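/* A schematic sketch of the per-pair test above using plain integers for
   the relevant DR fields.  The helper is hypothetical and assumes that
   base, offset, step, element size and type compatibility have already
   been checked as in the loop above, and that the duplicate case
   INIT_B == INIT_PREV has been filtered out.  INIT_A is the init of the
   group's first member, INIT_PREV that of the member added last, INIT_B
   the candidate's init; STEP_BYTES is the common constant step (0 when
   unknown) and IS_READ says whether this is a load group.  */

static bool
example_can_extend_group_p (long long init_a, long long init_prev,
			    long long init_b, long long type_size,
			    long long step_bytes, bool is_read)
{
  /* The distance from the group start must be a whole number of
     elements for this to be an interleaving.  */
  if (type_size == 0 || (init_b - init_a) % type_size != 0)
    return false;
  /* Store groups must stay gap-free, i.e. adjacent accesses.  */
  if (!is_read && init_b - init_prev != type_size)
    return false;
  /* Once the distance reaches the step, the candidate belongs to a
     later iteration of the first access and the group is split.  */
  if (step_bytes != 0 && step_bytes <= init_b - init_a)
    return false;
  return true;
}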
2961 /* Function vect_vfa_segment_size.
2963 Create an expression that computes the size of the segment
2964 that will be accessed for a data reference. The function takes into
2965 account that realignment loads may access one more vector.
2967 Input:
2968 DR: The data reference.
2969 LENGTH_FACTOR: segment length to consider.
2971 Return an expression whose value is the size of the segment which will
2972 be accessed by DR. */
2974 static tree
2975 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2977 tree segment_length;
2979 if (integer_zerop (DR_STEP (dr)))
2980 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2981 else
2982 segment_length = size_binop (MULT_EXPR,
2983 fold_convert (sizetype, DR_STEP (dr)),
2984 fold_convert (sizetype, length_factor));
2986 if (vect_supportable_dr_alignment (dr, false)
2987 == dr_explicit_realign_optimized)
2989 tree vector_size = TYPE_SIZE_UNIT
2990 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2992 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2994 return segment_length;
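/* For reference, the quantity built above expressed with plain integers
   (a hypothetical sketch; the real function returns a tree expression
   and is also used with symbolic LENGTH_FACTORs): the segment is the
   step times the number of scalar iterations considered, a single
   element for an invariant access, plus one extra vector when the
   optimized realignment scheme may read one vector past the end.  A
   non-negative step is assumed for simplicity; negative steps are
   compensated for in vect_no_alias_p below.  */

static long long
example_segment_size (long long step_bytes, long long length_factor,
		      long long elem_size, long long vector_size,
		      bool realign_optimized_p)
{
  long long seg = (step_bytes == 0
		   ? elem_size
		   : step_bytes * length_factor);
  if (realign_optimized_p)
    seg += vector_size;
  return seg;
}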
2997 /* Function vect_no_alias_p.
2999 Given data references A and B with equal base and offset, for which
3000 the alias relation can therefore be decided at compilation time,
3001 return TRUE if they do not alias each other and FALSE otherwise.
3002 SEGMENT_LENGTH_A and SEGMENT_LENGTH_B are the memory lengths accessed
3003 by A and B respectively. */
3005 static bool
3006 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
3007 tree segment_length_a, tree segment_length_b)
3009 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
3010 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
3011 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
3012 return false;
3014 tree seg_a_min = DR_INIT (a);
3015 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
3016 seg_a_min, segment_length_a);
3017 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3018 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3019 [a, a+12) */
3020 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3022 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
3023 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
3024 seg_a_max, unit_size);
3025 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
3026 DR_INIT (a), unit_size);
3028 tree seg_b_min = DR_INIT (b);
3029 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
3030 seg_b_min, segment_length_b);
3031 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3033 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
3034 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
3035 seg_b_max, unit_size);
3036 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
3037 DR_INIT (b), unit_size);
3040 if (tree_int_cst_le (seg_a_max, seg_b_min)
3041 || tree_int_cst_le (seg_b_max, seg_a_min))
3042 return true;
3044 return false;
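/* A self-contained conceptual sketch of the disambiguation above, with
   signed byte offsets relative to the common base standing in for the
   DR_INIT and segment-length trees (the helper and its flat-integer
   interface are hypothetical).  Each reference covers a half-open
   interval starting at its init; for a negative step the segment length
   is negative and the interval is shifted up by one element, as in the
   int a[3] example in the comment above.  The references cannot alias
   when the two intervals do not overlap.  */

static bool
example_no_alias_p (long long init_a, long long seg_len_a,
		    long long step_a, long long elem_size_a,
		    long long init_b, long long seg_len_b,
		    long long step_b, long long elem_size_b)
{
  /* Identical starting offsets always alias.  */
  if (init_a == init_b)
    return false;

  long long min_a = init_a, max_a = init_a + seg_len_a;
  if (step_a < 0)
    {
      /* The access walks downwards: [init + seg_len + size, init + size).  */
      min_a = max_a + elem_size_a;
      max_a = init_a + elem_size_a;
    }

  long long min_b = init_b, max_b = init_b + seg_len_b;
  if (step_b < 0)
    {
      min_b = max_b + elem_size_b;
      max_b = init_b + elem_size_b;
    }

  /* Disjoint intervals cannot alias.  */
  return max_a <= min_b || max_b <= min_a;
}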
3047 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3048 in DDR is >= VF. */
3050 static bool
3051 dependence_distance_ge_vf (data_dependence_relation *ddr,
3052 unsigned int loop_depth, unsigned HOST_WIDE_INT vf)
3054 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3055 || DDR_NUM_DIST_VECTS (ddr) == 0)
3056 return false;
3058 /* If the dependence is exact, we should have limited the VF instead. */
3059 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3061 unsigned int i;
3062 lambda_vector dist_v;
3063 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3065 HOST_WIDE_INT dist = dist_v[loop_depth];
3066 if (dist != 0
3067 && !(dist > 0 && DDR_REVERSED_P (ddr))
3068 && (unsigned HOST_WIDE_INT) abs_hwi (dist) < vf)
3069 return false;
3072 if (dump_enabled_p ())
3074 dump_printf_loc (MSG_NOTE, vect_location,
3075 "dependence distance between ");
3076 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3077 dump_printf (MSG_NOTE, " and ");
3078 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3079 dump_printf (MSG_NOTE, " is >= VF\n");
3082 return true;
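/* A stripped-down sketch of the distance test above; the helper below is
   hypothetical and takes the distance-vector entries for the relevant
   loop depth as a plain array, with DDR_REVERSED_P reduced to one flag
   per entry.  It returns true when every constraining nonzero distance
   is at least VF, so vectorizing with that factor cannot violate the
   dependence.  */

static bool
example_distances_ge_vf_p (const long long *dist, const bool *reversed,
			   unsigned int n, unsigned long long vf)
{
  for (unsigned int i = 0; i < n; i++)
    {
      long long d = dist[i];
      if (d == 0)
	continue;
      /* A positive distance on a reversed relation does not constrain
	 us, mirroring the DDR_REVERSED_P check above.  */
      if (d > 0 && reversed[i])
	continue;
      unsigned long long ad
	= d < 0 ? 0ULL - (unsigned long long) d : (unsigned long long) d;
      if (ad < vf)
	return false;
    }
  return true;
}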
3085 /* Function vect_prune_runtime_alias_test_list.
3087 Prune a list of ddrs to be tested at run-time by versioning for alias.
3088 Merge several alias checks into one if possible.
3089 Return FALSE if the resulting list of ddrs is longer than allowed by
3090 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3092 bool
3093 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3095 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3096 hash_set <tree_pair_hash> compared_objects;
3098 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3099 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3100 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3101 vec<vec_object_pair> &check_unequal_addrs
3102 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3103 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3104 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3106 ddr_p ddr;
3107 unsigned int i;
3108 tree length_factor;
3110 if (dump_enabled_p ())
3111 dump_printf_loc (MSG_NOTE, vect_location,
3112 "=== vect_prune_runtime_alias_test_list ===\n");
3114 if (may_alias_ddrs.is_empty ())
3115 return true;
3117 comp_alias_ddrs.create (may_alias_ddrs.length ());
3119 unsigned int loop_depth
3120 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3121 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3123 /* First, we collect all data ref pairs for aliasing checks. */
3124 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3126 int comp_res;
3127 struct data_reference *dr_a, *dr_b;
3128 gimple *dr_group_first_a, *dr_group_first_b;
3129 tree segment_length_a, segment_length_b;
3130 gimple *stmt_a, *stmt_b;
3132 /* Ignore the alias if the VF we chose ended up being no greater
3133 than the dependence distance. */
3134 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3135 continue;
3137 if (DDR_OBJECT_A (ddr))
3139 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3140 if (!compared_objects.add (new_pair))
3142 if (dump_enabled_p ())
3144 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3145 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3146 dump_printf (MSG_NOTE, " and ");
3147 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3148 dump_printf (MSG_NOTE, " have different addresses\n");
3150 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3152 continue;
3155 dr_a = DDR_A (ddr);
3156 stmt_a = DR_STMT (DDR_A (ddr));
3157 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3158 if (dr_group_first_a)
3160 stmt_a = dr_group_first_a;
3161 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3164 dr_b = DDR_B (ddr);
3165 stmt_b = DR_STMT (DDR_B (ddr));
3166 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3167 if (dr_group_first_b)
3169 stmt_b = dr_group_first_b;
3170 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3173 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3174 length_factor = scalar_loop_iters;
3175 else
3176 length_factor = size_int (vect_factor);
3177 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3178 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3180 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3181 DR_BASE_ADDRESS (dr_b));
3182 if (comp_res == 0)
3183 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3184 DR_OFFSET (dr_b));
3186 /* Alias is known at compilation time. */
3187 if (comp_res == 0
3188 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3189 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3190 && TREE_CODE (segment_length_a) == INTEGER_CST
3191 && TREE_CODE (segment_length_b) == INTEGER_CST)
3193 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3194 continue;
3196 if (dump_enabled_p ())
3197 dump_printf_loc (MSG_NOTE, vect_location,
3198 "not vectorized: compilation time alias.\n");
3200 return false;
3203 dr_with_seg_len_pair_t dr_with_seg_len_pair
3204 (dr_with_seg_len (dr_a, segment_length_a),
3205 dr_with_seg_len (dr_b, segment_length_b));
3207 /* Canonicalize pairs by sorting the two DR members. */
3208 if (comp_res > 0)
3209 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3211 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3214 prune_runtime_alias_test_list (&comp_alias_ddrs,
3215 (unsigned HOST_WIDE_INT) vect_factor);
3217 unsigned int count = (comp_alias_ddrs.length ()
3218 + check_unequal_addrs.length ());
3219 dump_printf_loc (MSG_NOTE, vect_location,
3220 "improved number of alias checks from %d to %d\n",
3221 may_alias_ddrs.length (), count);
3222 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3224 if (dump_enabled_p ())
3225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3226 "number of versioning for alias "
3227 "run-time tests exceeds %d "
3228 "(--param vect-max-version-for-alias-checks)\n",
3229 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3230 return false;
3233 return true;
3236 /* Return true if a non-affine read or write in STMT is suitable for a
3237 gather load or scatter store. Describe the operation in *INFO if so. */
3239 bool
3240 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3241 gather_scatter_info *info)
3243 HOST_WIDE_INT scale = 1;
3244 poly_int64 pbitpos, pbitsize;
3245 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3246 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3247 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3248 tree offtype = NULL_TREE;
3249 tree decl, base, off;
3250 machine_mode pmode;
3251 int punsignedp, reversep, pvolatilep = 0;
3253 base = DR_REF (dr);
3254 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3255 see if we can use the def stmt of the address. */
3256 if (is_gimple_call (stmt)
3257 && gimple_call_internal_p (stmt)
3258 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3259 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3260 && TREE_CODE (base) == MEM_REF
3261 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3262 && integer_zerop (TREE_OPERAND (base, 1))
3263 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3265 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3266 if (is_gimple_assign (def_stmt)
3267 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3268 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3271 /* The gather and scatter builtins need an address of the form
3272 loop_invariant + vector * {1, 2, 4, 8}
3273 or
3274 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3275 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3276 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3277 multiplications and additions in it. To get a vector, we need
3278 a single SSA_NAME that will be defined in the loop and will
3279 contain everything that is not loop invariant and that can be
3280 vectorized. The following code attempts to find such a preexisting
3281 SSA_NAME OFF and put the loop invariants into a tree BASE
3282 that can be gimplified before the loop. */
3283 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3284 &punsignedp, &reversep, &pvolatilep);
3285 gcc_assert (base && !reversep);
3286 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3288 if (TREE_CODE (base) == MEM_REF)
3290 if (!integer_zerop (TREE_OPERAND (base, 1)))
3292 if (off == NULL_TREE)
3293 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3294 else
3295 off = size_binop (PLUS_EXPR, off,
3296 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3298 base = TREE_OPERAND (base, 0);
3300 else
3301 base = build_fold_addr_expr (base);
3303 if (off == NULL_TREE)
3304 off = size_zero_node;
3306 /* If BASE is not loop invariant and OFF is 0, then we start with just
3307 the constant offset in the loop-invariant BASE and continue with BASE
3308 as OFF; if OFF is nonzero, give up.
3309 We could handle that case by gimplifying the addition of base + off
3310 into some SSA_NAME and using that as OFF, but for now punt. */
3311 if (!expr_invariant_in_loop_p (loop, base))
3313 if (!integer_zerop (off))
3314 return false;
3315 off = base;
3316 base = size_int (pbytepos);
3318 /* Otherwise put base + constant offset into the loop invariant BASE
3319 and continue with OFF. */
3320 else
3322 base = fold_convert (sizetype, base);
3323 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3326 /* OFF at this point may be either a SSA_NAME or some tree expression
3327 from get_inner_reference. Try to peel off loop invariants from it
3328 into BASE as long as possible. */
3329 STRIP_NOPS (off);
3330 while (offtype == NULL_TREE)
3332 enum tree_code code;
3333 tree op0, op1, add = NULL_TREE;
3335 if (TREE_CODE (off) == SSA_NAME)
3337 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3339 if (expr_invariant_in_loop_p (loop, off))
3340 return false;
3342 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3343 break;
3345 op0 = gimple_assign_rhs1 (def_stmt);
3346 code = gimple_assign_rhs_code (def_stmt);
3347 op1 = gimple_assign_rhs2 (def_stmt);
3349 else
3351 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3352 return false;
3353 code = TREE_CODE (off);
3354 extract_ops_from_tree (off, &code, &op0, &op1);
3356 switch (code)
3358 case POINTER_PLUS_EXPR:
3359 case PLUS_EXPR:
3360 if (expr_invariant_in_loop_p (loop, op0))
3362 add = op0;
3363 off = op1;
3364 do_add:
3365 add = fold_convert (sizetype, add);
3366 if (scale != 1)
3367 add = size_binop (MULT_EXPR, add, size_int (scale));
3368 base = size_binop (PLUS_EXPR, base, add);
3369 continue;
3371 if (expr_invariant_in_loop_p (loop, op1))
3373 add = op1;
3374 off = op0;
3375 goto do_add;
3377 break;
3378 case MINUS_EXPR:
3379 if (expr_invariant_in_loop_p (loop, op1))
3381 add = fold_convert (sizetype, op1);
3382 add = size_binop (MINUS_EXPR, size_zero_node, add);
3383 off = op0;
3384 goto do_add;
3386 break;
3387 case MULT_EXPR:
3388 if (scale == 1 && tree_fits_shwi_p (op1))
3390 scale = tree_to_shwi (op1);
3391 off = op0;
3392 continue;
3394 break;
3395 case SSA_NAME:
3396 off = op0;
3397 continue;
3398 CASE_CONVERT:
3399 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3400 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3401 break;
3402 if (TYPE_PRECISION (TREE_TYPE (op0))
3403 == TYPE_PRECISION (TREE_TYPE (off)))
3405 off = op0;
3406 continue;
3408 if (TYPE_PRECISION (TREE_TYPE (op0))
3409 < TYPE_PRECISION (TREE_TYPE (off)))
3411 off = op0;
3412 offtype = TREE_TYPE (off);
3413 STRIP_NOPS (off);
3414 continue;
3416 break;
3417 default:
3418 break;
3420 break;
3423 /* If at the end OFF still isn't a SSA_NAME or isn't
3424 defined in the loop, punt. */
3425 if (TREE_CODE (off) != SSA_NAME
3426 || expr_invariant_in_loop_p (loop, off))
3427 return false;
3429 if (offtype == NULL_TREE)
3430 offtype = TREE_TYPE (off);
3432 if (DR_IS_READ (dr))
3433 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3434 offtype, scale);
3435 else
3436 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3437 offtype, scale);
3439 if (decl == NULL_TREE)
3440 return false;
3442 info->decl = decl;
3443 info->base = base;
3444 info->offset = off;
3445 info->offset_dt = vect_unknown_def_type;
3446 info->offset_vectype = NULL_TREE;
3447 info->scale = scale;
3448 return true;
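/* For readers unfamiliar with the addressing form matched above, this is
   what one lane of a gather ends up computing once BASE, OFF and SCALE
   have been decomposed (a hypothetical scalar sketch; the real code
   instead records the operands in *INFO and later emits the target's
   gather/scatter builtin on whole vectors):
     address = loop_invariant_base + offset_lane * {1, 2, 4, 8}.  */

static unsigned long long
example_gather_lane_address (unsigned long long base,
			     long long lane_offset, long long scale)
{
  /* SCALE is one of 1, 2, 4 or 8; LANE_OFFSET is the (possibly
     sign-extended) element of the offset vector for this lane.  */
  return base + (unsigned long long) (lane_offset * scale);
}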
3451 /* Function vect_analyze_data_refs.
3453 Find all the data references in the loop or basic block.
3455 The general structure of the analysis of data refs in the vectorizer is as
3456 follows:
3457 1- vect_analyze_data_refs(loop/bb): call
3458 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3459 in the loop/bb and their dependences.
3460 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3461 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3462 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3466 bool
3467 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3469 struct loop *loop = NULL;
3470 unsigned int i;
3471 struct data_reference *dr;
3472 tree scalar_type;
3474 if (dump_enabled_p ())
3475 dump_printf_loc (MSG_NOTE, vect_location,
3476 "=== vect_analyze_data_refs ===\n");
3478 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3479 loop = LOOP_VINFO_LOOP (loop_vinfo);
3481 /* Go through the data-refs, check that the analysis succeeded. Update
3482 pointer from stmt_vec_info struct to DR and vectype. */
3484 vec<data_reference_p> datarefs = vinfo->datarefs;
3485 FOR_EACH_VEC_ELT (datarefs, i, dr)
3487 gimple *stmt;
3488 stmt_vec_info stmt_info;
3489 tree base, offset, init;
3490 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3491 bool simd_lane_access = false;
3492 int vf;
3494 again:
3495 if (!dr || !DR_REF (dr))
3497 if (dump_enabled_p ())
3498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3499 "not vectorized: unhandled data-ref\n");
3500 return false;
3503 stmt = DR_STMT (dr);
3504 stmt_info = vinfo_for_stmt (stmt);
3506 /* Discard clobbers from the dataref vector. We will remove
3507 clobber stmts during vectorization. */
3508 if (gimple_clobber_p (stmt))
3510 free_data_ref (dr);
3511 if (i == datarefs.length () - 1)
3513 datarefs.pop ();
3514 break;
3516 datarefs.ordered_remove (i);
3517 dr = datarefs[i];
3518 goto again;
3521 /* Check that analysis of the data-ref succeeded. */
3522 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3523 || !DR_STEP (dr))
3525 bool maybe_gather
3526 = DR_IS_READ (dr)
3527 && !TREE_THIS_VOLATILE (DR_REF (dr))
3528 && targetm.vectorize.builtin_gather != NULL;
3529 bool maybe_scatter
3530 = DR_IS_WRITE (dr)
3531 && !TREE_THIS_VOLATILE (DR_REF (dr))
3532 && targetm.vectorize.builtin_scatter != NULL;
3533 bool maybe_simd_lane_access
3534 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3536 /* If the target supports vector gather loads or scatter stores, or if
3537 this might be a SIMD lane access, see if it can be handled that way. */
3538 if (is_a <loop_vec_info> (vinfo)
3539 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3540 && !nested_in_vect_loop_p (loop, stmt))
3542 struct data_reference *newdr
3543 = create_data_ref (NULL, loop_containing_stmt (stmt),
3544 DR_REF (dr), stmt, !maybe_scatter,
3545 DR_IS_CONDITIONAL_IN_STMT (dr));
3546 gcc_assert (newdr != NULL && DR_REF (newdr));
3547 if (DR_BASE_ADDRESS (newdr)
3548 && DR_OFFSET (newdr)
3549 && DR_INIT (newdr)
3550 && DR_STEP (newdr)
3551 && integer_zerop (DR_STEP (newdr)))
3553 if (maybe_simd_lane_access)
3555 tree off = DR_OFFSET (newdr);
3556 STRIP_NOPS (off);
3557 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3558 && TREE_CODE (off) == MULT_EXPR
3559 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3561 tree step = TREE_OPERAND (off, 1);
3562 off = TREE_OPERAND (off, 0);
3563 STRIP_NOPS (off);
3564 if (CONVERT_EXPR_P (off)
3565 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3566 0)))
3567 < TYPE_PRECISION (TREE_TYPE (off)))
3568 off = TREE_OPERAND (off, 0);
3569 if (TREE_CODE (off) == SSA_NAME)
3571 gimple *def = SSA_NAME_DEF_STMT (off);
3572 tree reft = TREE_TYPE (DR_REF (newdr));
3573 if (is_gimple_call (def)
3574 && gimple_call_internal_p (def)
3575 && (gimple_call_internal_fn (def)
3576 == IFN_GOMP_SIMD_LANE))
3578 tree arg = gimple_call_arg (def, 0);
3579 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3580 arg = SSA_NAME_VAR (arg);
3581 if (arg == loop->simduid
3582 /* For now. */
3583 && tree_int_cst_equal
3584 (TYPE_SIZE_UNIT (reft),
3585 step))
3587 DR_OFFSET (newdr) = ssize_int (0);
3588 DR_STEP (newdr) = step;
3589 DR_OFFSET_ALIGNMENT (newdr)
3590 = BIGGEST_ALIGNMENT;
3591 DR_STEP_ALIGNMENT (newdr)
3592 = highest_pow2_factor (step);
3593 dr = newdr;
3594 simd_lane_access = true;
3600 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3602 dr = newdr;
3603 if (maybe_gather)
3604 gatherscatter = GATHER;
3605 else
3606 gatherscatter = SCATTER;
3609 if (gatherscatter == SG_NONE && !simd_lane_access)
3610 free_data_ref (newdr);
3613 if (gatherscatter == SG_NONE && !simd_lane_access)
3615 if (dump_enabled_p ())
3617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3618 "not vectorized: data ref analysis "
3619 "failed ");
3620 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3623 if (is_a <bb_vec_info> (vinfo))
3624 break;
3626 return false;
3630 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3632 if (dump_enabled_p ())
3633 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3634 "not vectorized: base addr of dr is a "
3635 "constant\n");
3637 if (is_a <bb_vec_info> (vinfo))
3638 break;
3640 if (gatherscatter != SG_NONE || simd_lane_access)
3641 free_data_ref (dr);
3642 return false;
3645 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3647 if (dump_enabled_p ())
3649 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3650 "not vectorized: volatile type ");
3651 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3654 if (is_a <bb_vec_info> (vinfo))
3655 break;
3657 return false;
3660 if (stmt_can_throw_internal (stmt))
3662 if (dump_enabled_p ())
3664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3665 "not vectorized: statement can throw an "
3666 "exception ");
3667 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3670 if (is_a <bb_vec_info> (vinfo))
3671 break;
3673 if (gatherscatter != SG_NONE || simd_lane_access)
3674 free_data_ref (dr);
3675 return false;
3678 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3679 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3681 if (dump_enabled_p ())
3683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3684 "not vectorized: statement is bitfield "
3685 "access ");
3686 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3689 if (is_a <bb_vec_info> (vinfo))
3690 break;
3692 if (gatherscatter != SG_NONE || simd_lane_access)
3693 free_data_ref (dr);
3694 return false;
3697 base = unshare_expr (DR_BASE_ADDRESS (dr));
3698 offset = unshare_expr (DR_OFFSET (dr));
3699 init = unshare_expr (DR_INIT (dr));
3701 if (is_gimple_call (stmt)
3702 && (!gimple_call_internal_p (stmt)
3703 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3704 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3706 if (dump_enabled_p ())
3708 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3709 "not vectorized: dr in a call ");
3710 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3713 if (is_a <bb_vec_info> (vinfo))
3714 break;
3716 if (gatherscatter != SG_NONE || simd_lane_access)
3717 free_data_ref (dr);
3718 return false;
3721 /* Update DR field in stmt_vec_info struct. */
3723 /* If the dataref is in an inner-loop of the loop that is considered for
3724 vectorization, we also want to analyze the access relative to
3725 the outer-loop (DR contains information only relative to the
3726 inner-most enclosing loop). We do that by building a reference to the
3727 first location accessed by the inner-loop, and analyze it relative to
3728 the outer-loop. */
3729 if (loop && nested_in_vect_loop_p (loop, stmt))
3731 /* Build a reference to the first location accessed by the
3732 inner loop: *(BASE + INIT + OFFSET). By construction,
3733 this address must be invariant in the inner loop, so we
3734 can consider it as being used in the outer loop. */
3735 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3736 init, offset);
3737 tree init_addr = fold_build_pointer_plus (base, init_offset);
3738 tree init_ref = build_fold_indirect_ref (init_addr);
3740 if (dump_enabled_p ())
3742 dump_printf_loc (MSG_NOTE, vect_location,
3743 "analyze in outer loop: ");
3744 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3745 dump_printf (MSG_NOTE, "\n");
3748 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3749 init_ref, loop))
3750 /* dr_analyze_innermost already explained the failure. */
3751 return false;
3753 if (dump_enabled_p ())
3755 dump_printf_loc (MSG_NOTE, vect_location,
3756 "\touter base_address: ");
3757 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3758 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3759 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3760 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3761 STMT_VINFO_DR_OFFSET (stmt_info));
3762 dump_printf (MSG_NOTE,
3763 "\n\touter constant offset from base address: ");
3764 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3765 STMT_VINFO_DR_INIT (stmt_info));
3766 dump_printf (MSG_NOTE, "\n\touter step: ");
3767 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3768 STMT_VINFO_DR_STEP (stmt_info));
3769 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3770 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3771 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3772 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3773 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3774 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3775 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3776 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3780 if (STMT_VINFO_DATA_REF (stmt_info))
3782 if (dump_enabled_p ())
3784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3785 "not vectorized: more than one data ref "
3786 "in stmt: ");
3787 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3790 if (is_a <bb_vec_info> (vinfo))
3791 break;
3793 if (gatherscatter != SG_NONE || simd_lane_access)
3794 free_data_ref (dr);
3795 return false;
3798 STMT_VINFO_DATA_REF (stmt_info) = dr;
3799 if (simd_lane_access)
3801 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3802 free_data_ref (datarefs[i]);
3803 datarefs[i] = dr;
3806 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3807 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3808 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3810 if (dump_enabled_p ())
3812 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3813 "not vectorized: base object not addressable "
3814 "for stmt: ");
3815 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3817 if (is_a <bb_vec_info> (vinfo))
3819 /* In BB vectorization the ref can still participate
3820 in dependence analysis; we just can't vectorize it. */
3821 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3822 continue;
3824 return false;
3827 /* Set vectype for STMT. */
3828 scalar_type = TREE_TYPE (DR_REF (dr));
3829 STMT_VINFO_VECTYPE (stmt_info)
3830 = get_vectype_for_scalar_type (scalar_type);
3831 if (!STMT_VINFO_VECTYPE (stmt_info))
3833 if (dump_enabled_p ())
3835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3836 "not vectorized: no vectype for stmt: ");
3837 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3838 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3839 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3840 scalar_type);
3841 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3844 if (is_a <bb_vec_info> (vinfo))
3846 /* No vector type is fine; the ref can still participate
3847 in dependence analysis, we just can't vectorize it. */
3848 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3849 continue;
3852 if (gatherscatter != SG_NONE || simd_lane_access)
3854 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3855 if (gatherscatter != SG_NONE)
3856 free_data_ref (dr);
3858 return false;
3860 else
3862 if (dump_enabled_p ())
3864 dump_printf_loc (MSG_NOTE, vect_location,
3865 "got vectype for stmt: ");
3866 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3867 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3868 STMT_VINFO_VECTYPE (stmt_info));
3869 dump_printf (MSG_NOTE, "\n");
3873 /* Adjust the minimal vectorization factor according to the
3874 vector type. */
3875 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3876 if (vf > *min_vf)
3877 *min_vf = vf;
3879 if (gatherscatter != SG_NONE)
3881 gather_scatter_info gs_info;
3882 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3883 &gs_info)
3884 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3886 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3887 free_data_ref (dr);
3888 if (dump_enabled_p ())
3890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3891 (gatherscatter == GATHER) ?
3892 "not vectorized: not suitable for gather "
3893 "load " :
3894 "not vectorized: not suitable for scatter "
3895 "store ");
3896 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3898 return false;
3901 free_data_ref (datarefs[i]);
3902 datarefs[i] = dr;
3903 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3906 else if (is_a <loop_vec_info> (vinfo)
3907 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3909 if (nested_in_vect_loop_p (loop, stmt))
3911 if (dump_enabled_p ())
3913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3914 "not vectorized: not suitable for strided "
3915 "load ");
3916 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3918 return false;
3920 STMT_VINFO_STRIDED_P (stmt_info) = true;
3924 /* If, when trying to vectorize a basic-block, we stopped analysis at the
3925 first dataref we could not analyze, mark the rest of the datarefs as
3926 not vectorizable and truncate the vector of datarefs. That avoids
3927 spending useless time analyzing their dependences. */
3928 if (i != datarefs.length ())
3930 gcc_assert (is_a <bb_vec_info> (vinfo));
3931 for (unsigned j = i; j < datarefs.length (); ++j)
3933 data_reference_p dr = datarefs[j];
3934 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3935 free_data_ref (dr);
3937 datarefs.truncate (i);
3940 return true;
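/* Illustrative note (assumed example, not from the original sources): the
   simd_lane_access case recognized above matches the arrays of per-lane
   copies that the OpenMP lowering creates for privatized variables in a
   "#pragma omp simd" loop, roughly

     _5 = GOMP_SIMD_LANE (simduid.0);
     D.1234[_5] = ...;

   where DR_OFFSET of the reference is _5 multiplied by the element size.
   The code above rewrites such a data-ref to have a zero offset and a step
   equal to the element size, so it vectorizes as an ordinary contiguous
   access.  */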
3944 /* Function vect_get_new_vect_var.
3946 Returns a name for a new variable. The current naming scheme appends the
3947 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3948 the name of vectorizer generated variables, and appends that to NAME if
3949 provided. */
3951 tree
3952 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3954 const char *prefix;
3955 tree new_vect_var;
3957 switch (var_kind)
3959 case vect_simple_var:
3960 prefix = "vect";
3961 break;
3962 case vect_scalar_var:
3963 prefix = "stmp";
3964 break;
3965 case vect_mask_var:
3966 prefix = "mask";
3967 break;
3968 case vect_pointer_var:
3969 prefix = "vectp";
3970 break;
3971 default:
3972 gcc_unreachable ();
3975 if (name)
3977 char* tmp = concat (prefix, "_", name, NULL);
3978 new_vect_var = create_tmp_reg (type, tmp);
3979 free (tmp);
3981 else
3982 new_vect_var = create_tmp_reg (type, prefix);
3984 return new_vect_var;
3987 /* Like vect_get_new_vect_var but return an SSA name. */
3989 tree
3990 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3992 const char *prefix;
3993 tree new_vect_var;
3995 switch (var_kind)
3997 case vect_simple_var:
3998 prefix = "vect";
3999 break;
4000 case vect_scalar_var:
4001 prefix = "stmp";
4002 break;
4003 case vect_pointer_var:
4004 prefix = "vectp";
4005 break;
4006 default:
4007 gcc_unreachable ();
4010 if (name)
4012 char* tmp = concat (prefix, "_", name, NULL);
4013 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4014 free (tmp);
4016 else
4017 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4019 return new_vect_var;
4022 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4024 static void
4025 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4027 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4028 int misalign = DR_MISALIGNMENT (dr);
4029 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4030 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4031 else
4032 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4033 DR_TARGET_ALIGNMENT (dr), misalign);
4036 /* Function vect_create_addr_base_for_vector_ref.
4038 Create an expression that computes the address of the first memory location
4039 that will be accessed for a data reference.
4041 Input:
4042 STMT: The statement containing the data reference.
4043 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4044 OFFSET: Optional. If supplied, it is added to the initial address.
4045 LOOP: Specify relative to which loop-nest the address should be computed.
4046 For example, when the dataref is in an inner-loop nested in an
4047 outer-loop that is now being vectorized, LOOP can be either the
4048 outer-loop, or the inner-loop. The first memory location accessed
4049 by the following dataref ('in' points to short):
4051 for (i=0; i<N; i++)
4052 for (j=0; j<M; j++)
4053 s += in[i+j]
4055 is as follows:
4056 if LOOP=i_loop: &in (relative to i_loop)
4057 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4058 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4059 initial address. Unlike OFFSET, which is the number of elements to
4060 be added, BYTE_OFFSET is measured in bytes.
4062 Output:
4063 1. Return an SSA_NAME whose value is the address of the memory location of
4064 the first vector of the data reference.
4065 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4066 these statement(s) which define the returned SSA_NAME.
4068 FORNOW: We are only handling array accesses with step 1. */
4070 tree
4071 vect_create_addr_base_for_vector_ref (gimple *stmt,
4072 gimple_seq *new_stmt_list,
4073 tree offset,
4074 tree byte_offset)
4076 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4077 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4078 const char *base_name;
4079 tree addr_base;
4080 tree dest;
4081 gimple_seq seq = NULL;
4082 tree vect_ptr_type;
4083 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4084 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4085 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4087 tree data_ref_base = unshare_expr (drb->base_address);
4088 tree base_offset = unshare_expr (drb->offset);
4089 tree init = unshare_expr (drb->init);
4091 if (loop_vinfo)
4092 base_name = get_name (data_ref_base);
4093 else
4095 base_offset = ssize_int (0);
4096 init = ssize_int (0);
4097 base_name = get_name (DR_REF (dr));
4100 /* Create base_offset */
4101 base_offset = size_binop (PLUS_EXPR,
4102 fold_convert (sizetype, base_offset),
4103 fold_convert (sizetype, init));
4105 if (offset)
4107 offset = fold_build2 (MULT_EXPR, sizetype,
4108 fold_convert (sizetype, offset), step);
4109 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4110 base_offset, offset);
4112 if (byte_offset)
4114 byte_offset = fold_convert (sizetype, byte_offset);
4115 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4116 base_offset, byte_offset);
4119 /* base + base_offset */
4120 if (loop_vinfo)
4121 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4122 else
4124 addr_base = build1 (ADDR_EXPR,
4125 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4126 unshare_expr (DR_REF (dr)));
4129 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4130 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4131 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4132 gimple_seq_add_seq (new_stmt_list, seq);
4134 if (DR_PTR_INFO (dr)
4135 && TREE_CODE (addr_base) == SSA_NAME
4136 && !SSA_NAME_PTR_INFO (addr_base))
4138 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4139 if (offset || byte_offset)
4140 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4143 if (dump_enabled_p ())
4145 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4146 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4147 dump_printf (MSG_NOTE, "\n");
4150 return addr_base;
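/* Illustrative note (hypothetical names): for a reference a[i] with
   "int a[N]", the innermost behavior typically has base_address == &a and
   offset == init == 0, while the element size is 4.  A call such as

     addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
                                                  some_offset, NULL_TREE);

   then builds "&a + (sizetype) some_offset * 4", forces it into a fresh
   SSA name whose base carries the "vectp" prefix, and appends the defining
   statements to STMTS for the caller to insert (e.g. on the loop preheader
   edge).  */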
4154 /* Function vect_create_data_ref_ptr.
4156 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4157 location accessed in the loop by STMT, along with the def-use update
4158 chain to appropriately advance the pointer through the loop iterations.
4159 Also set aliasing information for the pointer. This pointer is used by
4160 the callers to this function to create a memory reference expression for
4161 vector load/store access.
4163 Input:
4164 1. STMT: a stmt that references memory. Expected to be of the form
4165 GIMPLE_ASSIGN <name, data-ref> or
4166 GIMPLE_ASSIGN <data-ref, name>.
4167 2. AGGR_TYPE: the type of the reference, which should be either a vector
4168 or an array.
4169 3. AT_LOOP: the loop where the vector memref is to be created.
4170 4. OFFSET (optional): an offset to be added to the initial address accessed
4171 by the data-ref in STMT.
4172 5. BSI: location where the new stmts are to be placed if there is no loop
4173 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4174 pointing to the initial address.
4175 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4176 to the initial address accessed by the data-ref in STMT. This is
4177 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4178 in bytes.
4180 Output:
4181 1. Declare a new ptr to vector_type, and have it point to the base of the
4182 data reference (initial address accessed by the data reference).
4183 For example, for vector of type V8HI, the following code is generated:
4185 v8hi *ap;
4186 ap = (v8hi *)initial_address;
4188 if OFFSET is not supplied:
4189 initial_address = &a[init];
4190 if OFFSET is supplied:
4191 initial_address = &a[init + OFFSET];
4192 if BYTE_OFFSET is supplied:
4193 initial_address = &a[init] + BYTE_OFFSET;
4195 Return the initial_address in INITIAL_ADDRESS.
4197 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4198 update the pointer in each iteration of the loop.
4200 Return the increment stmt that updates the pointer in PTR_INCR.
4202 3. Set INV_P to true if the access pattern of the data reference in the
4203 vectorized loop is invariant. Set it to false otherwise.
4205 4. Return the pointer. */
4207 tree
4208 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4209 tree offset, tree *initial_address,
4210 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4211 bool only_init, bool *inv_p, tree byte_offset)
4213 const char *base_name;
4214 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4215 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4216 struct loop *loop = NULL;
4217 bool nested_in_vect_loop = false;
4218 struct loop *containing_loop = NULL;
4219 tree aggr_ptr_type;
4220 tree aggr_ptr;
4221 tree new_temp;
4222 gimple_seq new_stmt_list = NULL;
4223 edge pe = NULL;
4224 basic_block new_bb;
4225 tree aggr_ptr_init;
4226 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4227 tree aptr;
4228 gimple_stmt_iterator incr_gsi;
4229 bool insert_after;
4230 tree indx_before_incr, indx_after_incr;
4231 gimple *incr;
4232 tree step;
4233 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4235 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4236 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4238 if (loop_vinfo)
4240 loop = LOOP_VINFO_LOOP (loop_vinfo);
4241 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4242 containing_loop = (gimple_bb (stmt))->loop_father;
4243 pe = loop_preheader_edge (loop);
4245 else
4247 gcc_assert (bb_vinfo);
4248 only_init = true;
4249 *ptr_incr = NULL;
4252 /* Check the step (evolution) of the load in LOOP, and record
4253 whether it's invariant. */
4254 step = vect_dr_behavior (dr)->step;
4255 if (integer_zerop (step))
4256 *inv_p = true;
4257 else
4258 *inv_p = false;
4260 /* Create an expression for the first address accessed by this load
4261 in LOOP. */
4262 base_name = get_name (DR_BASE_ADDRESS (dr));
4264 if (dump_enabled_p ())
4266 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4267 dump_printf_loc (MSG_NOTE, vect_location,
4268 "create %s-pointer variable to type: ",
4269 get_tree_code_name (TREE_CODE (aggr_type)));
4270 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4271 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4272 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4273 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4274 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4275 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4276 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4277 else
4278 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4279 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4280 dump_printf (MSG_NOTE, "\n");
4283 /* (1) Create the new aggregate-pointer variable.
4284 Vector and array types inherit the alias set of their component
4285 type by default so we need to use a ref-all pointer if the data
4286 reference does not conflict with the created aggregated data
4287 reference because it is not addressable. */
4288 bool need_ref_all = false;
4289 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4290 get_alias_set (DR_REF (dr))))
4291 need_ref_all = true;
4292 /* Likewise for any of the data references in the stmt group. */
4293 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4295 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4298 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4299 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4300 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4301 get_alias_set (DR_REF (sdr))))
4303 need_ref_all = true;
4304 break;
4306 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4308 while (orig_stmt);
4310 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4311 need_ref_all);
4312 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4315 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4316 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4317 def-use update cycles for the pointer: one relative to the outer-loop
4318 (LOOP), which is what steps (3) and (4) below do. The other is relative
4319 to the inner-loop (which is the inner-most loop containing the dataref),
4320 and this is done by step (5) below.
4322 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4323 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4324 redundant. Steps (3),(4) create the following:
4326 vp0 = &base_addr;
4327 LOOP: vp1 = phi(vp0,vp2)
4330 vp2 = vp1 + step
4331 goto LOOP
4333 If there is an inner-loop nested in loop, then step (5) will also be
4334 applied, and an additional update in the inner-loop will be created:
4336 vp0 = &base_addr;
4337 LOOP: vp1 = phi(vp0,vp2)
4339 inner: vp3 = phi(vp1,vp4)
4340 vp4 = vp3 + inner_step
4341 if () goto inner
4343 vp2 = vp1 + step
4344 if () goto LOOP */
4346 /* (2) Calculate the initial address of the aggregate-pointer, and set
4347 the aggregate-pointer to point to it before the loop. */
4349 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4351 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4352 offset, byte_offset);
4353 if (new_stmt_list)
4355 if (pe)
4357 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4358 gcc_assert (!new_bb);
4360 else
4361 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4364 *initial_address = new_temp;
4365 aggr_ptr_init = new_temp;
4367 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4368 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4369 inner-loop nested in LOOP (during outer-loop vectorization). */
4371 /* No update in loop is required. */
4372 if (only_init && (!loop_vinfo || at_loop == loop))
4373 aptr = aggr_ptr_init;
4374 else
4376 /* The step of the aggregate pointer is the type size. */
4377 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4378 /* One exception to the above is when the scalar step of the load in
4379 LOOP is zero. In this case the step here is also zero. */
4380 if (*inv_p)
4381 iv_step = size_zero_node;
4382 else if (tree_int_cst_sgn (step) == -1)
4383 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4385 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4387 create_iv (aggr_ptr_init,
4388 fold_convert (aggr_ptr_type, iv_step),
4389 aggr_ptr, loop, &incr_gsi, insert_after,
4390 &indx_before_incr, &indx_after_incr);
4391 incr = gsi_stmt (incr_gsi);
4392 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4394 /* Copy the points-to information if it exists. */
4395 if (DR_PTR_INFO (dr))
4397 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4398 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4400 if (ptr_incr)
4401 *ptr_incr = incr;
4403 aptr = indx_before_incr;
4406 if (!nested_in_vect_loop || only_init)
4407 return aptr;
4410 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4411 nested in LOOP, if it exists. */
4413 gcc_assert (nested_in_vect_loop);
4414 if (!only_init)
4416 standard_iv_increment_position (containing_loop, &incr_gsi,
4417 &insert_after);
4418 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4419 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4420 &indx_after_incr);
4421 incr = gsi_stmt (incr_gsi);
4422 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4424 /* Copy the points-to information if it exists. */
4425 if (DR_PTR_INFO (dr))
4427 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4428 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4430 if (ptr_incr)
4431 *ptr_incr = incr;
4433 return indx_before_incr;
4435 else
4436 gcc_unreachable ();
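/* Illustrative note (assumed SSA names): when vectorizing a contiguous
   access a[i] with V4SI vectors, the pointer created by this function
   leads to GIMPLE along the lines of

     # preheader
     vectp_a.5 = &a;                       <-- step (2), initial address
     # loop
     vectp_a.6 = PHI <vectp_a.5, vectp_a.7>
     ... = MEM[vectp_a.6];                 <-- use created by the caller
     vectp_a.7 = vectp_a.6 + 16;           <-- steps (3)/(4), bump by VS

   i.e. the single def-use cycle shown in the comment above; the
   nested-in-LOOP case adds a second cycle in the inner loop (step (5)).  */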
4440 /* Function bump_vector_ptr
4442 Increment a pointer (to a vector type) by vector-size. If requested,
4443 i.e. if PTR-INCR is given, then also connect the new increment stmt
4444 to the existing def-use update-chain of the pointer, by modifying
4445 the PTR_INCR as illustrated below:
4447 The pointer def-use update-chain before this function:
4448 DATAREF_PTR = phi (p_0, p_2)
4449 ....
4450 PTR_INCR: p_2 = DATAREF_PTR + step
4452 The pointer def-use update-chain after this function:
4453 DATAREF_PTR = phi (p_0, p_2)
4454 ....
4455 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4456 ....
4457 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4459 Input:
4460 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4461 in the loop.
4462 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4463 the loop. The increment amount across iterations is expected
4464 to be vector_size.
4465 BSI - location where the new update stmt is to be placed.
4466 STMT - the original scalar memory-access stmt that is being vectorized.
4467 BUMP - optional. The offset by which to bump the pointer. If not given,
4468 the offset is assumed to be vector_size.
4470 Output: Return NEW_DATAREF_PTR as illustrated above.
4474 tree
4475 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4476 gimple *stmt, tree bump)
4478 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4479 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4480 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4481 tree update = TYPE_SIZE_UNIT (vectype);
4482 gassign *incr_stmt;
4483 ssa_op_iter iter;
4484 use_operand_p use_p;
4485 tree new_dataref_ptr;
4487 if (bump)
4488 update = bump;
4490 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4491 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4492 else
4493 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4494 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4495 dataref_ptr, update);
4496 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4498 /* Copy the points-to information if it exists. */
4499 if (DR_PTR_INFO (dr))
4501 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4502 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4505 if (!ptr_incr)
4506 return new_dataref_ptr;
4508 /* Update the vector-pointer's cross-iteration increment. */
4509 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4511 tree use = USE_FROM_PTR (use_p);
4513 if (use == dataref_ptr)
4514 SET_USE (use_p, new_dataref_ptr);
4515 else
4516 gcc_assert (tree_int_cst_compare (use, update) == 0);
4519 return new_dataref_ptr;
4523 /* Function vect_create_destination_var.
4525 Create a new temporary of type VECTYPE. */
4527 tree
4528 vect_create_destination_var (tree scalar_dest, tree vectype)
4530 tree vec_dest;
4531 const char *name;
4532 char *new_name;
4533 tree type;
4534 enum vect_var_kind kind;
4536 kind = vectype
4537 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4538 ? vect_mask_var
4539 : vect_simple_var
4540 : vect_scalar_var;
4541 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4543 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4545 name = get_name (scalar_dest);
4546 if (name)
4547 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4548 else
4549 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4550 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4551 free (new_name);
4553 return vec_dest;
4556 /* Function vect_grouped_store_supported.
4558 Returns TRUE if interleave high and interleave low permutations
4559 are supported, and FALSE otherwise. */
4561 bool
4562 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4564 machine_mode mode = TYPE_MODE (vectype);
4566 /* vect_permute_store_chain requires the group size to be equal to 3 or
4567 be a power of two. */
4568 if (count != 3 && exact_log2 (count) == -1)
4570 if (dump_enabled_p ())
4571 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4572 "the size of the group of accesses"
4573 " is not a power of 2 or not eqaul to 3\n");
4574 return false;
4577 /* Check that the permutation is supported. */
4578 if (VECTOR_MODE_P (mode))
4580 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4581 auto_vec_perm_indices sel (nelt);
4582 sel.quick_grow (nelt);
4584 if (count == 3)
4586 unsigned int j0 = 0, j1 = 0, j2 = 0;
4587 unsigned int i, j;
4589 for (j = 0; j < 3; j++)
4591 int nelt0 = ((3 - j) * nelt) % 3;
4592 int nelt1 = ((3 - j) * nelt + 1) % 3;
4593 int nelt2 = ((3 - j) * nelt + 2) % 3;
4594 for (i = 0; i < nelt; i++)
4596 if (3 * i + nelt0 < nelt)
4597 sel[3 * i + nelt0] = j0++;
4598 if (3 * i + nelt1 < nelt)
4599 sel[3 * i + nelt1] = nelt + j1++;
4600 if (3 * i + nelt2 < nelt)
4601 sel[3 * i + nelt2] = 0;
4603 if (!can_vec_perm_p (mode, false, &sel))
4605 if (dump_enabled_p ())
4606 dump_printf (MSG_MISSED_OPTIMIZATION,
4607 "permutaion op not supported by target.\n");
4608 return false;
4611 for (i = 0; i < nelt; i++)
4613 if (3 * i + nelt0 < nelt)
4614 sel[3 * i + nelt0] = 3 * i + nelt0;
4615 if (3 * i + nelt1 < nelt)
4616 sel[3 * i + nelt1] = 3 * i + nelt1;
4617 if (3 * i + nelt2 < nelt)
4618 sel[3 * i + nelt2] = nelt + j2++;
4620 if (!can_vec_perm_p (mode, false, &sel))
4622 if (dump_enabled_p ())
4623 dump_printf (MSG_MISSED_OPTIMIZATION,
4624 "permutaion op not supported by target.\n");
4625 return false;
4628 return true;
4630 else
4632 /* If length is not equal to 3 then only power of 2 is supported. */
4633 gcc_assert (pow2p_hwi (count));
4635 for (i = 0; i < nelt / 2; i++)
4637 sel[i * 2] = i;
4638 sel[i * 2 + 1] = i + nelt;
4640 if (can_vec_perm_p (mode, false, &sel))
4642 for (i = 0; i < nelt; i++)
4643 sel[i] += nelt / 2;
4644 if (can_vec_perm_p (mode, false, &sel))
4645 return true;
4650 if (dump_enabled_p ())
4651 dump_printf (MSG_MISSED_OPTIMIZATION,
4652 "permutaion op not supported by target.\n");
4653 return false;
4657 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4658 type VECTYPE. */
4660 bool
4661 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4663 return vect_lanes_optab_supported_p ("vec_store_lanes",
4664 vec_store_lanes_optab,
4665 vectype, count);
4669 /* Function vect_permute_store_chain.
4671 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4672 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4673 the data correctly for the stores. Return the final references for stores
4674 in RESULT_CHAIN.
4676 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4677 The input is 4 vectors each containing 8 elements. We assign a number to
4678 each element, the input sequence is:
4680 1st vec: 0 1 2 3 4 5 6 7
4681 2nd vec: 8 9 10 11 12 13 14 15
4682 3rd vec: 16 17 18 19 20 21 22 23
4683 4th vec: 24 25 26 27 28 29 30 31
4685 The output sequence should be:
4687 1st vec: 0 8 16 24 1 9 17 25
4688 2nd vec: 2 10 18 26 3 11 19 27
4689 3rd vec: 4 12 20 28 5 13 21 29
4690 4th vec: 6 14 22 30 7 15 23 31
4692 i.e., we interleave the contents of the four vectors in their order.
4694 We use interleave_high/low instructions to create such output. The input of
4695 each interleave_high/low operation is two vectors:
4696 1st vec 2nd vec
4697 0 1 2 3 4 5 6 7
4698 the even elements of the result vector are obtained left-to-right from the
4699 high/low elements of the first vector. The odd elements of the result are
4700 obtained left-to-right from the high/low elements of the second vector.
4701 The output of interleave_high will be: 0 4 1 5
4702 and of interleave_low: 2 6 3 7
4705 The permutation is done in log LENGTH stages. In each stage interleave_high
4706 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4707 where the first argument is taken from the first half of DR_CHAIN and the
4708 second argument from its second half.
4709 In our example,
4711 I1: interleave_high (1st vec, 3rd vec)
4712 I2: interleave_low (1st vec, 3rd vec)
4713 I3: interleave_high (2nd vec, 4th vec)
4714 I4: interleave_low (2nd vec, 4th vec)
4716 The output for the first stage is:
4718 I1: 0 16 1 17 2 18 3 19
4719 I2: 4 20 5 21 6 22 7 23
4720 I3: 8 24 9 25 10 26 11 27
4721 I4: 12 28 13 29 14 30 15 31
4723 The output of the second stage, i.e. the final result is:
4725 I1: 0 8 16 24 1 9 17 25
4726 I2: 2 10 18 26 3 11 19 27
4727 I3: 4 12 20 28 5 13 21 29
4728 I4: 6 14 22 30 7 15 23 31. */
4730 void
4731 vect_permute_store_chain (vec<tree> dr_chain,
4732 unsigned int length,
4733 gimple *stmt,
4734 gimple_stmt_iterator *gsi,
4735 vec<tree> *result_chain)
4737 tree vect1, vect2, high, low;
4738 gimple *perm_stmt;
4739 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4740 tree perm_mask_low, perm_mask_high;
4741 tree data_ref;
4742 tree perm3_mask_low, perm3_mask_high;
4743 unsigned int i, n, log_length = exact_log2 (length);
4744 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4746 auto_vec_perm_indices sel (nelt);
4747 sel.quick_grow (nelt);
4749 result_chain->quick_grow (length);
4750 memcpy (result_chain->address (), dr_chain.address (),
4751 length * sizeof (tree));
4753 if (length == 3)
4755 unsigned int j0 = 0, j1 = 0, j2 = 0;
4757 for (j = 0; j < 3; j++)
4759 int nelt0 = ((3 - j) * nelt) % 3;
4760 int nelt1 = ((3 - j) * nelt + 1) % 3;
4761 int nelt2 = ((3 - j) * nelt + 2) % 3;
4763 for (i = 0; i < nelt; i++)
4765 if (3 * i + nelt0 < nelt)
4766 sel[3 * i + nelt0] = j0++;
4767 if (3 * i + nelt1 < nelt)
4768 sel[3 * i + nelt1] = nelt + j1++;
4769 if (3 * i + nelt2 < nelt)
4770 sel[3 * i + nelt2] = 0;
4772 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4774 for (i = 0; i < nelt; i++)
4776 if (3 * i + nelt0 < nelt)
4777 sel[3 * i + nelt0] = 3 * i + nelt0;
4778 if (3 * i + nelt1 < nelt)
4779 sel[3 * i + nelt1] = 3 * i + nelt1;
4780 if (3 * i + nelt2 < nelt)
4781 sel[3 * i + nelt2] = nelt + j2++;
4783 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4785 vect1 = dr_chain[0];
4786 vect2 = dr_chain[1];
4788 /* Create interleaving stmt:
4789 low = VEC_PERM_EXPR <vect1, vect2,
4790 {j, nelt, *, j + 1, nelt + j + 1, *,
4791 j + 2, nelt + j + 2, *, ...}> */
4792 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4793 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4794 vect2, perm3_mask_low);
4795 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4797 vect1 = data_ref;
4798 vect2 = dr_chain[2];
4799 /* Create interleaving stmt:
4800 low = VEC_PERM_EXPR <vect1, vect2,
4801 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4802 6, 7, nelt + j + 2, ...}> */
4803 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4804 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4805 vect2, perm3_mask_high);
4806 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4807 (*result_chain)[j] = data_ref;
4810 else
4812 /* If length is not equal to 3 then only power of 2 is supported. */
4813 gcc_assert (pow2p_hwi (length));
4815 for (i = 0, n = nelt / 2; i < n; i++)
4817 sel[i * 2] = i;
4818 sel[i * 2 + 1] = i + nelt;
4820 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4822 for (i = 0; i < nelt; i++)
4823 sel[i] += nelt / 2;
4824 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4826 for (i = 0, n = log_length; i < n; i++)
4828 for (j = 0; j < length/2; j++)
4830 vect1 = dr_chain[j];
4831 vect2 = dr_chain[j+length/2];
4833 /* Create interleaving stmt:
4834 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4835 ...}> */
4836 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4837 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4838 vect2, perm_mask_high);
4839 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4840 (*result_chain)[2*j] = high;
4842 /* Create interleaving stmt:
4843 low = VEC_PERM_EXPR <vect1, vect2,
4844 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4845 ...}> */
4846 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4847 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4848 vect2, perm_mask_low);
4849 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4850 (*result_chain)[2*j+1] = low;
4852 memcpy (dr_chain.address (), result_chain->address (),
4853 length * sizeof (tree));
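/* Worked example (illustrative, V8HI vectors, LENGTH == 2): with nelt == 8
   the masks built above are

     perm_mask_high = { 0, 8, 1, 9, 2, 10, 3, 11 }
     perm_mask_low  = { 4, 12, 5, 13, 6, 14, 7, 15 }

   and the single stage emits

     vect_inter_high_1 = VEC_PERM_EXPR <vect1, vect2, perm_mask_high>;
     vect_inter_low_2  = VEC_PERM_EXPR <vect1, vect2, perm_mask_low>;

   interleaving the two input vectors element by element, exactly as in the
   numbered example in the function comment (SSA names are made up).  */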
4858 /* Function vect_setup_realignment
4860 This function is called when vectorizing an unaligned load using
4861 the dr_explicit_realign[_optimized] scheme.
4862 This function generates the following code at the loop prolog:
4864 p = initial_addr;
4865 x msq_init = *(floor(p)); # prolog load
4866 realignment_token = call target_builtin;
4867 loop:
4868 x msq = phi (msq_init, ---)
4870 The stmts marked with x are generated only for the case of
4871 dr_explicit_realign_optimized.
4873 The code above sets up a new (vector) pointer, pointing to the first
4874 location accessed by STMT, and a "floor-aligned" load using that pointer.
4875 It also generates code to compute the "realignment-token" (if the relevant
4876 target hook was defined), and creates a phi-node at the loop-header bb
4877 whose arguments are the result of the prolog-load (created by this
4878 function) and the result of a load that takes place in the loop (to be
4879 created by the caller to this function).
4881 For the case of dr_explicit_realign_optimized:
4882 The caller to this function uses the phi-result (msq) to create the
4883 realignment code inside the loop, and sets up the missing phi argument,
4884 as follows:
4885 loop:
4886 msq = phi (msq_init, lsq)
4887 lsq = *(floor(p')); # load in loop
4888 result = realign_load (msq, lsq, realignment_token);
4890 For the case of dr_explicit_realign:
4891 loop:
4892 msq = *(floor(p)); # load in loop
4893 p' = p + (VS-1);
4894 lsq = *(floor(p')); # load in loop
4895 result = realign_load (msq, lsq, realignment_token);
4897 Input:
4898 STMT - (scalar) load stmt to be vectorized. This load accesses
4899 a memory location that may be unaligned.
4900 BSI - place where new code is to be inserted.
4901 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4902 is used.
4904 Output:
4905 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4906 target hook, if defined.
4907 Return value - the result of the loop-header phi node. */
4909 tree
4910 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4911 tree *realignment_token,
4912 enum dr_alignment_support alignment_support_scheme,
4913 tree init_addr,
4914 struct loop **at_loop)
4916 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4917 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4918 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4919 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4920 struct loop *loop = NULL;
4921 edge pe = NULL;
4922 tree scalar_dest = gimple_assign_lhs (stmt);
4923 tree vec_dest;
4924 gimple *inc;
4925 tree ptr;
4926 tree data_ref;
4927 basic_block new_bb;
4928 tree msq_init = NULL_TREE;
4929 tree new_temp;
4930 gphi *phi_stmt;
4931 tree msq = NULL_TREE;
4932 gimple_seq stmts = NULL;
4933 bool inv_p;
4934 bool compute_in_loop = false;
4935 bool nested_in_vect_loop = false;
4936 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4937 struct loop *loop_for_initial_load = NULL;
4939 if (loop_vinfo)
4941 loop = LOOP_VINFO_LOOP (loop_vinfo);
4942 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4945 gcc_assert (alignment_support_scheme == dr_explicit_realign
4946 || alignment_support_scheme == dr_explicit_realign_optimized);
4948 /* We need to generate three things:
4949 1. the misalignment computation
4950 2. the extra vector load (for the optimized realignment scheme).
4951 3. the phi node for the two vectors from which the realignment is
4952 done (for the optimized realignment scheme). */
4954 /* 1. Determine where to generate the misalignment computation.
4956 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4957 calculation will be generated by this function, outside the loop (in the
4958 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4959 caller, inside the loop.
4961 Background: If the misalignment remains fixed throughout the iterations of
4962 the loop, then both realignment schemes are applicable, and also the
4963 misalignment computation can be done outside LOOP. This is because we are
4964 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4965 are a multiple of VS (the Vector Size), and therefore the misalignment in
4966 different vectorized LOOP iterations is always the same.
4967 The problem arises only if the memory access is in an inner-loop nested
4968 inside LOOP, which is now being vectorized using outer-loop vectorization.
4969 This is the only case when the misalignment of the memory access may not
4970 remain fixed throughout the iterations of the inner-loop (as explained in
4971 detail in vect_supportable_dr_alignment). In this case, not only is the
4972 optimized realignment scheme not applicable, but also the misalignment
4973 computation (and generation of the realignment token that is passed to
4974 REALIGN_LOAD) have to be done inside the loop.
4976 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4977 or not, which in turn determines if the misalignment is computed inside
4978 the inner-loop, or outside LOOP. */
4980 if (init_addr != NULL_TREE || !loop_vinfo)
4982 compute_in_loop = true;
4983 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4987 /* 2. Determine where to generate the extra vector load.
4989 For the optimized realignment scheme, instead of generating two vector
4990 loads in each iteration, we generate a single extra vector load in the
4991 preheader of the loop, and in each iteration reuse the result of the
4992 vector load from the previous iteration. In case the memory access is in
4993 an inner-loop nested inside LOOP, which is now being vectorized using
4994 outer-loop vectorization, we need to determine whether this initial vector
4995 load should be generated at the preheader of the inner-loop, or can be
4996 generated at the preheader of LOOP. If the memory access has no evolution
4997 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4998 to be generated inside LOOP (in the preheader of the inner-loop). */
5000 if (nested_in_vect_loop)
5002 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5003 bool invariant_in_outerloop =
5004 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5005 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5007 else
5008 loop_for_initial_load = loop;
5009 if (at_loop)
5010 *at_loop = loop_for_initial_load;
5012 if (loop_for_initial_load)
5013 pe = loop_preheader_edge (loop_for_initial_load);
5015 /* 3. For the case of the optimized realignment, create the first vector
5016 load at the loop preheader. */
5018 if (alignment_support_scheme == dr_explicit_realign_optimized)
5020 /* Create msq_init = *(floor(p1)) in the loop preheader */
5021 gassign *new_stmt;
5023 gcc_assert (!compute_in_loop);
5024 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5025 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5026 NULL_TREE, &init_addr, NULL, &inc,
5027 true, &inv_p);
5028 if (TREE_CODE (ptr) == SSA_NAME)
5029 new_temp = copy_ssa_name (ptr);
5030 else
5031 new_temp = make_ssa_name (TREE_TYPE (ptr));
5032 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5033 new_stmt = gimple_build_assign
5034 (new_temp, BIT_AND_EXPR, ptr,
5035 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5036 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5037 gcc_assert (!new_bb);
5038 data_ref
5039 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5040 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5041 new_stmt = gimple_build_assign (vec_dest, data_ref);
5042 new_temp = make_ssa_name (vec_dest, new_stmt);
5043 gimple_assign_set_lhs (new_stmt, new_temp);
5044 if (pe)
5046 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5047 gcc_assert (!new_bb);
5049 else
5050 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5052 msq_init = gimple_assign_lhs (new_stmt);
5055 /* 4. Create realignment token using a target builtin, if available.
5056 It is done either inside the containing loop, or before LOOP (as
5057 determined above). */
5059 if (targetm.vectorize.builtin_mask_for_load)
5061 gcall *new_stmt;
5062 tree builtin_decl;
5064 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5065 if (!init_addr)
5067 /* Generate the INIT_ADDR computation outside LOOP. */
5068 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5069 NULL_TREE);
5070 if (loop)
5072 pe = loop_preheader_edge (loop);
5073 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5074 gcc_assert (!new_bb);
5076 else
5077 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5080 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5081 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5082 vec_dest =
5083 vect_create_destination_var (scalar_dest,
5084 gimple_call_return_type (new_stmt));
5085 new_temp = make_ssa_name (vec_dest, new_stmt);
5086 gimple_call_set_lhs (new_stmt, new_temp);
5088 if (compute_in_loop)
5089 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5090 else
5092 /* Generate the misalignment computation outside LOOP. */
5093 pe = loop_preheader_edge (loop);
5094 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5095 gcc_assert (!new_bb);
5098 *realignment_token = gimple_call_lhs (new_stmt);
5100 /* The result of the CALL_EXPR to this builtin is determined from
5101 the value of the parameter and no global variables are touched
5102 which makes the builtin a "const" function. Requiring the
5103 builtin to have the "const" attribute makes it unnecessary
5104 to call mark_call_clobbered. */
5105 gcc_assert (TREE_READONLY (builtin_decl));
5108 if (alignment_support_scheme == dr_explicit_realign)
5109 return msq;
5111 gcc_assert (!compute_in_loop);
5112 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5115 /* 5. Create msq = phi <msq_init, lsq> in loop */
5117 pe = loop_preheader_edge (containing_loop);
5118 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5119 msq = make_ssa_name (vec_dest);
5120 phi_stmt = create_phi_node (msq, containing_loop->header);
5121 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5123 return msq;
5127 /* Function vect_grouped_load_supported.
5129 COUNT is the size of the load group (the number of statements plus the
5130 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5131 only one statement, with a gap of COUNT - 1.
5133 Returns true if a suitable permute exists. */
5135 bool
5136 vect_grouped_load_supported (tree vectype, bool single_element_p,
5137 unsigned HOST_WIDE_INT count)
5139 machine_mode mode = TYPE_MODE (vectype);
5141 /* If this is single-element interleaving with an element distance
5142 that leaves unused vector loads around, punt - we at least create
5143 very sub-optimal code in that case (and blow up memory,
5144 see PR65518). */
5145 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5147 if (dump_enabled_p ())
5148 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5149 "single-element interleaving not supported "
5150 "for not adjacent vector loads\n");
5151 return false;
5154 /* vect_permute_load_chain requires the group size to be equal to 3 or
5155 be a power of two. */
5156 if (count != 3 && exact_log2 (count) == -1)
5158 if (dump_enabled_p ())
5159 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5160 "the size of the group of accesses"
5161 " is not a power of 2 or not equal to 3\n");
5162 return false;
5165 /* Check that the permutation is supported. */
5166 if (VECTOR_MODE_P (mode))
5168 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5169 auto_vec_perm_indices sel (nelt);
5170 sel.quick_grow (nelt);
5172 if (count == 3)
5174 unsigned int k;
5175 for (k = 0; k < 3; k++)
5177 for (i = 0; i < nelt; i++)
5178 if (3 * i + k < 2 * nelt)
5179 sel[i] = 3 * i + k;
5180 else
5181 sel[i] = 0;
5182 if (!can_vec_perm_p (mode, false, &sel))
5184 if (dump_enabled_p ())
5185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5186 "shuffle of 3 loads is not supported by"
5187 " target\n");
5188 return false;
5190 for (i = 0, j = 0; i < nelt; i++)
5191 if (3 * i + k < 2 * nelt)
5192 sel[i] = i;
5193 else
5194 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5195 if (!can_vec_perm_p (mode, false, &sel))
5197 if (dump_enabled_p ())
5198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5199 "shuffle of 3 loads is not supported by"
5200 " target\n");
5201 return false;
5204 return true;
5206 else
5208 /* If length is not equal to 3 then only power of 2 is supported. */
5209 gcc_assert (pow2p_hwi (count));
5210 for (i = 0; i < nelt; i++)
5211 sel[i] = i * 2;
5212 if (can_vec_perm_p (mode, false, &sel))
5214 for (i = 0; i < nelt; i++)
5215 sel[i] = i * 2 + 1;
5216 if (can_vec_perm_p (mode, false, &sel))
5217 return true;
5222 if (dump_enabled_p ())
5223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5224 "extract even/odd not supported by target\n");
5225 return false;
5228 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5229 type VECTYPE. */
5231 bool
5232 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5234 return vect_lanes_optab_supported_p ("vec_load_lanes",
5235 vec_load_lanes_optab,
5236 vectype, count);
5239 /* Function vect_permute_load_chain.
5241 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5242 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5243 the input data correctly. Return the final references for loads in
5244 RESULT_CHAIN.
5246 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5247 The input is 4 vectors each containing 8 elements. We assign a number to each
5248 element, the input sequence is:
5250 1st vec: 0 1 2 3 4 5 6 7
5251 2nd vec: 8 9 10 11 12 13 14 15
5252 3rd vec: 16 17 18 19 20 21 22 23
5253 4th vec: 24 25 26 27 28 29 30 31
5255 The output sequence should be:
5257 1st vec: 0 4 8 12 16 20 24 28
5258 2nd vec: 1 5 9 13 17 21 25 29
5259 3rd vec: 2 6 10 14 18 22 26 30
5260 4th vec: 3 7 11 15 19 23 27 31
5262 i.e., the first output vector should contain the first elements of each
5263 interleaving group, etc.
5265 We use extract_even/odd instructions to create such output. The input of
5266 each extract_even/odd operation is two vectors
5267 1st vec 2nd vec
5268 0 1 2 3 4 5 6 7
5270 and the output is the vector of extracted even/odd elements. The output of
5271 extract_even will be: 0 2 4 6
5272 and of extract_odd: 1 3 5 7
5275 The permutation is done in log LENGTH stages. In each stage extract_even
5276 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5277 their order. In our example,
5279 E1: extract_even (1st vec, 2nd vec)
5280 E2: extract_odd (1st vec, 2nd vec)
5281 E3: extract_even (3rd vec, 4th vec)
5282 E4: extract_odd (3rd vec, 4th vec)
5284 The output for the first stage will be:
5286 E1: 0 2 4 6 8 10 12 14
5287 E2: 1 3 5 7 9 11 13 15
5288 E3: 16 18 20 22 24 26 28 30
5289 E4: 17 19 21 23 25 27 29 31
5291 In order to proceed and create the correct sequence for the next stage (or
5292 for the correct output, if the second stage is the last one, as in our
5293 example), we first put the output of extract_even operation and then the
5294 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5295 The input for the second stage is:
5297 1st vec (E1): 0 2 4 6 8 10 12 14
5298 2nd vec (E3): 16 18 20 22 24 26 28 30
5299 3rd vec (E2): 1 3 5 7 9 11 13 15
5300 4th vec (E4): 17 19 21 23 25 27 29 31
5302 The output of the second stage:
5304 E1: 0 4 8 12 16 20 24 28
5305 E2: 2 6 10 14 18 22 26 30
5306 E3: 1 5 9 13 17 21 25 29
5307 E4: 3 7 11 15 19 23 27 31
5309 And RESULT_CHAIN after reordering:
5311 1st vec (E1): 0 4 8 12 16 20 24 28
5312 2nd vec (E3): 1 5 9 13 17 21 25 29
5313 3rd vec (E2): 2 6 10 14 18 22 26 30
5314 4th vec (E4): 3 7 11 15 19 23 27 31. */
5316 static void
5317 vect_permute_load_chain (vec<tree> dr_chain,
5318 unsigned int length,
5319 gimple *stmt,
5320 gimple_stmt_iterator *gsi,
5321 vec<tree> *result_chain)
5323 tree data_ref, first_vect, second_vect;
5324 tree perm_mask_even, perm_mask_odd;
5325 tree perm3_mask_low, perm3_mask_high;
5326 gimple *perm_stmt;
5327 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5328 unsigned int i, j, log_length = exact_log2 (length);
5329 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5331 auto_vec_perm_indices sel (nelt);
5332 sel.quick_grow (nelt);
5334 result_chain->quick_grow (length);
5335 memcpy (result_chain->address (), dr_chain.address (),
5336 length * sizeof (tree));
5338 if (length == 3)
5340 unsigned int k;
5342 for (k = 0; k < 3; k++)
5344 for (i = 0; i < nelt; i++)
5345 if (3 * i + k < 2 * nelt)
5346 sel[i] = 3 * i + k;
5347 else
5348 sel[i] = 0;
5349 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5351 for (i = 0, j = 0; i < nelt; i++)
5352 if (3 * i + k < 2 * nelt)
5353 sel[i] = i;
5354 else
5355 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5357 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5359 first_vect = dr_chain[0];
5360 second_vect = dr_chain[1];
5362 /* Create interleaving stmt (low part of):
5363 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5364 ...}> */
5365 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5366 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5367 second_vect, perm3_mask_low);
5368 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5370 /* Create interleaving stmt (high part of):
5371 high = VEC_PERM_EXPR <low, dr_chain[2], perm3_mask_high>, keeping the
5372 elements already selected and filling the rest from the third vector. */
5373 first_vect = data_ref;
5374 second_vect = dr_chain[2];
5375 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5376 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5377 second_vect, perm3_mask_high);
5378 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5379 (*result_chain)[k] = data_ref;
5382 else
5384 /* If length is not equal to 3, then only a power-of-2 length is supported. */
5385 gcc_assert (pow2p_hwi (length));
5387 for (i = 0; i < nelt; ++i)
5388 sel[i] = i * 2;
5389 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5391 for (i = 0; i < nelt; ++i)
5392 sel[i] = i * 2 + 1;
5393 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5395 for (i = 0; i < log_length; i++)
5397 for (j = 0; j < length; j += 2)
5399 first_vect = dr_chain[j];
5400 second_vect = dr_chain[j+1];
5402 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5403 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5404 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5405 first_vect, second_vect,
5406 perm_mask_even);
5407 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5408 (*result_chain)[j/2] = data_ref;
5410 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5411 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5412 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5413 first_vect, second_vect,
5414 perm_mask_odd);
5415 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5416 (*result_chain)[j/2+length/2] = data_ref;
5418 memcpy (dr_chain.address (), result_chain->address (),
5419 length * sizeof (tree));
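/* A rough scalar model of the power-of-two path above, kept purely as an
   illustration of the technique; the type vec_t, the helpers and the
   constant NELT below are hypothetical and do not exist in GCC (assume
   <string.h> for memcpy and that NELT is even).  After log2 (LENGTH)
   passes, IN[k] holds the k-th interleaving group, matching the
   RESULT_CHAIN layout documented before vect_permute_load_chain.

     typedef struct { int e[NELT]; } vec_t;

     // Even elements of the concatenation of A and B.
     static vec_t
     extract_even (vec_t a, vec_t b)
     {
       vec_t r;
       for (int i = 0; i < NELT / 2; i++)
         {
           r.e[i] = a.e[2 * i];
           r.e[NELT / 2 + i] = b.e[2 * i];
         }
       return r;
     }

     // Odd elements of the concatenation of A and B.
     static vec_t
     extract_odd (vec_t a, vec_t b)
     {
       vec_t r;
       for (int i = 0; i < NELT / 2; i++)
         {
           r.e[i] = a.e[2 * i + 1];
           r.e[NELT / 2 + i] = b.e[2 * i + 1];
         }
       return r;
     }

     void
     model_permute_load_chain (vec_t *in, vec_t *out, unsigned length)
     {
       for (unsigned stage = 0; (1u << stage) < length; stage++)
         {
           for (unsigned j = 0; j < length; j += 2)
             {
               out[j / 2] = extract_even (in[j], in[j + 1]);
               out[j / 2 + length / 2] = extract_odd (in[j], in[j + 1]);
             }
           memcpy (in, out, length * sizeof (vec_t));
         }
     }
 */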
5424 /* Function vect_shift_permute_load_chain.
5426 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5427 sequence of stmts to reorder the input data accordingly.
5428 Return the final references for loads in RESULT_CHAIN.
5429 Return true if successful, false otherwise.
5431 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5432 The input is 3 vectors each containing 8 elements. We assign a
5433 number to each element, the input sequence is:
5435 1st vec: 0 1 2 3 4 5 6 7
5436 2nd vec: 8 9 10 11 12 13 14 15
5437 3rd vec: 16 17 18 19 20 21 22 23
5439 The output sequence should be:
5441 1st vec: 0 3 6 9 12 15 18 21
5442 2nd vec: 1 4 7 10 13 16 19 22
5443 3rd vec: 2 5 8 11 14 17 20 23
5445 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5447 First we shuffle all 3 vectors to get correct elements order:
5449 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5450 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5451 3rd vec: (16 19 22) (17 20 23) (18 21)
5453 Next we unite and shift vector 3 times:
5455 1st step:
5456 shift right by 6 the concatenation of:
5457 "1st vec" and "2nd vec"
5458 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5459 "2nd vec" and "3rd vec"
5460 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5461 "3rd vec" and "1st vec"
5462 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5463 | New vectors |
5465 So that now new vectors are:
5467 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5468 2nd vec: (10 13) (16 19 22) (17 20 23)
5469 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5471 2nd step:
5472 shift right by 5 the concatenation of:
5473 "1st vec" and "3rd vec"
5474 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5475 "2nd vec" and "1st vec"
5476 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5477 "3rd vec" and "2nd vec"
5478 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5479 | New vectors |
5481 So that now new vectors are:
5483 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5484 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5485 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5487 3rd step:
5488 shift right by 5 the concatenation of:
5489 "1st vec" and "1st vec"
5490 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5491 shift right by 3 the concatenation of:
5492 "2nd vec" and "2nd vec"
5493 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5494 | New vectors |
5496 So that now all vectors are READY:
5497 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5498 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5499 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5501 This algorithm is faster than the one in vect_permute_load_chain if:
5502 1. "shift of a concatenation" is faster than general permutation.
5503 This is usually so.
5504 2. The TARGET machine can't execute vector instructions in parallel.
5505 This is because each step of the algorithm depends on the previous one.
5506 The algorithm in vect_permute_load_chain is much more parallel.
5508 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5511 static bool
5512 vect_shift_permute_load_chain (vec<tree> dr_chain,
5513 unsigned int length,
5514 gimple *stmt,
5515 gimple_stmt_iterator *gsi,
5516 vec<tree> *result_chain)
5518 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5519 tree perm2_mask1, perm2_mask2, perm3_mask;
5520 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5521 gimple *perm_stmt;
5523 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5524 unsigned int i;
5525 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5526 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5527 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5529 auto_vec_perm_indices sel (nelt);
5530 sel.quick_grow (nelt);
5532 result_chain->quick_grow (length);
5533 memcpy (result_chain->address (), dr_chain.address (),
5534 length * sizeof (tree));
5536 if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5538 unsigned int j, log_length = exact_log2 (length);
5539 for (i = 0; i < nelt / 2; ++i)
5540 sel[i] = i * 2;
5541 for (i = 0; i < nelt / 2; ++i)
5542 sel[nelt / 2 + i] = i * 2 + 1;
5543 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5545 if (dump_enabled_p ())
5546 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5547 "shuffle of 2 fields structure is not \
5548 supported by target\n");
5549 return false;
5551 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5553 for (i = 0; i < nelt / 2; ++i)
5554 sel[i] = i * 2 + 1;
5555 for (i = 0; i < nelt / 2; ++i)
5556 sel[nelt / 2 + i] = i * 2;
5557 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5559 if (dump_enabled_p ())
5560 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5561 "shuffle of 2 fields structure is not \
5562 supported by target\n");
5563 return false;
5565 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5567 /* Generating permutation constant to shift all elements.
5568 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5569 for (i = 0; i < nelt; i++)
5570 sel[i] = nelt / 2 + i;
5571 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5573 if (dump_enabled_p ())
5574 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5575 "shift permutation is not supported by target\n");
5576 return false;
5578 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5580 /* Generating permutation constant to select a vector from the two.
5581 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5582 for (i = 0; i < nelt / 2; i++)
5583 sel[i] = i;
5584 for (i = nelt / 2; i < nelt; i++)
5585 sel[i] = nelt + i;
5586 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5588 if (dump_enabled_p ())
5589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5590 "select is not supported by target\n");
5591 return false;
5593 select_mask = vect_gen_perm_mask_checked (vectype, sel);
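/* Worked example (illustration only): for LENGTH == 2 and NELT == 8, with
   input vectors { 0 1 2 3 4 5 6 7 } and { 8 9 10 11 12 13 14 15 }, the
   loop below computes
     vect[0] = { 0 2 4 6 1 3 5 7 }            (perm2_mask1)
     vect[1] = { 9 11 13 15 8 10 12 14 }      (perm2_mask2)
     { 1 3 5 7 9 11 13 15 }                   (shift1_mask) -> (*result_chain)[1]
     { 0 2 4 6 8 10 12 14 }                   (select_mask) -> (*result_chain)[0]
   i.e. the even and the odd interleaving group.  */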
5595 for (i = 0; i < log_length; i++)
5597 for (j = 0; j < length; j += 2)
5599 first_vect = dr_chain[j];
5600 second_vect = dr_chain[j + 1];
5602 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5603 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5604 first_vect, first_vect,
5605 perm2_mask1);
5606 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5607 vect[0] = data_ref;
5609 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5610 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5611 second_vect, second_vect,
5612 perm2_mask2);
5613 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5614 vect[1] = data_ref;
5616 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5617 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5618 vect[0], vect[1], shift1_mask);
5619 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5620 (*result_chain)[j/2 + length/2] = data_ref;
5622 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5623 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5624 vect[0], vect[1], select_mask);
5625 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5626 (*result_chain)[j/2] = data_ref;
5628 memcpy (dr_chain.address (), result_chain->address (),
5629 length * sizeof (tree));
5631 return true;
5633 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5635 unsigned int k = 0, l = 0;
5637 /* Generating permutation constant to get all elements in right order.
5638 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5639 for (i = 0; i < nelt; i++)
5641 if (3 * k + (l % 3) >= nelt)
5643 k = 0;
5644 l += (3 - (nelt % 3));
5646 sel[i] = 3 * k + (l % 3);
5647 k++;
5649 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5651 if (dump_enabled_p ())
5652 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5653 "shuffle of 3 fields structure is not \
5654 supported by target\n");
5655 return false;
5657 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5659 /* Generating permutation constant to shift all elements.
5660 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5661 for (i = 0; i < nelt; i++)
5662 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5663 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5665 if (dump_enabled_p ())
5666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5667 "shift permutation is not supported by target\n");
5668 return false;
5670 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5672 /* Generating permutation constant to shift all elements.
5673 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5674 for (i = 0; i < nelt; i++)
5675 sel[i] = 2 * (nelt / 3) + 1 + i;
5676 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5678 if (dump_enabled_p ())
5679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5680 "shift permutation is not supported by target\n");
5681 return false;
5683 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5685 /* Generating permutation constant to shift all elements.
5686 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5687 for (i = 0; i < nelt; i++)
5688 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5689 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5691 if (dump_enabled_p ())
5692 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5693 "shift permutation is not supported by target\n");
5694 return false;
5696 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5698 /* Generating permutation constant to shift all elements.
5699 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5700 for (i = 0; i < nelt; i++)
5701 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5702 if (!can_vec_perm_p (TYPE_MODE (vectype), false, &sel))
5704 if (dump_enabled_p ())
5705 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5706 "shift permutation is not supported by target\n");
5707 return false;
5709 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
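/* The loops below implement the steps documented before this function:
   PERM3_MASK performs the initial per-vector shuffle, SHIFT1_MASK the
   "1st step" shifts, SHIFT2_MASK the "2nd step" shifts, and
   SHIFT3_MASK/SHIFT4_MASK the final "3rd step" that produces the two
   remaining output vectors.  */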
5711 for (k = 0; k < 3; k++)
5713 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5714 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5715 dr_chain[k], dr_chain[k],
5716 perm3_mask);
5717 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5718 vect[k] = data_ref;
5721 for (k = 0; k < 3; k++)
5723 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5724 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5725 vect[k % 3], vect[(k + 1) % 3],
5726 shift1_mask);
5727 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5728 vect_shift[k] = data_ref;
5731 for (k = 0; k < 3; k++)
5733 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5734 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5735 vect_shift[(4 - k) % 3],
5736 vect_shift[(3 - k) % 3],
5737 shift2_mask);
5738 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5739 vect[k] = data_ref;
5742 (*result_chain)[3 - (nelt % 3)] = vect[2];
5744 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5745 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5746 vect[0], shift3_mask);
5747 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5748 (*result_chain)[nelt % 3] = data_ref;
5750 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5751 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5752 vect[1], shift4_mask);
5753 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5754 (*result_chain)[0] = data_ref;
5755 return true;
5757 return false;
5760 /* Function vect_transform_grouped_load.
5762 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5763 to perform their permutation and attach the resulting vectorized statements to
5764 the scalar statements.
5767 void
5768 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5769 gimple_stmt_iterator *gsi)
5771 machine_mode mode;
5772 vec<tree> result_chain = vNULL;
5774 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5775 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
5776 permuted vectors that are ready for vector computation. */
5777 result_chain.create (size);
5779 /* If the reassociation width for the vector type is 2 or greater, the target
5780 machine can execute 2 or more vector instructions in parallel, so use
5781 vect_permute_load_chain. Otherwise (for non-power-of-two group sizes) try vect_shift_permute_load_chain first. */
5782 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5783 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5784 || pow2p_hwi (size)
5785 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5786 gsi, &result_chain))
5787 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5788 vect_record_grouped_load_vectors (stmt, result_chain);
5789 result_chain.release ();
5792 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5793 generated as part of the vectorization of STMT. Assign the statement
5794 for each vector to the associated scalar statement. */
5796 void
5797 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5799 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5800 gimple *next_stmt, *new_stmt;
5801 unsigned int i, gap_count;
5802 tree tmp_data_ref;
5804 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5805 Since we scan the chain starting from its first node, their order
5806 corresponds to the order of data-refs in RESULT_CHAIN. */
5807 next_stmt = first_stmt;
5808 gap_count = 1;
5809 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5811 if (!next_stmt)
5812 break;
5814 /* Skip the gaps.  Loads created for the gaps will be removed by the dead
5815 code elimination pass later.  No need to check for the first stmt in
5816 the group, since it always exists.
5817 GROUP_GAP is the number of steps in elements from the previous
5818 access (if there is no gap GROUP_GAP is 1). We skip loads that
5819 correspond to the gaps. */
5820 if (next_stmt != first_stmt
5821 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5823 gap_count++;
5824 continue;
5827 while (next_stmt)
5829 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5830 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5831 copies, and we put the new vector statement in the first available
5832 RELATED_STMT. */
5833 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5834 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5835 else
5837 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5839 gimple *prev_stmt =
5840 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5841 gimple *rel_stmt =
5842 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5843 while (rel_stmt)
5845 prev_stmt = rel_stmt;
5846 rel_stmt =
5847 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5850 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5851 new_stmt;
5855 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5856 gap_count = 1;
5857 /* If NEXT_STMT accesses the same DR as the previous statement,
5858 put the same TMP_DATA_REF as its vectorized statement; otherwise
5859 get the next data-ref from RESULT_CHAIN. */
5860 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5861 break;
5866 /* Function vect_can_force_dr_alignment_p.
5868 Returns whether the alignment of DECL can be forced to an ALIGNMENT-bit
5869 boundary. */
5871 bool
5872 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5874 if (!VAR_P (decl))
5875 return false;
5877 if (decl_in_symtab_p (decl)
5878 && !symtab_node::get (decl)->can_increase_alignment_p ())
5879 return false;
5881 if (TREE_STATIC (decl))
5882 return (alignment <= MAX_OFILE_ALIGNMENT);
5883 else
5884 return (alignment <= MAX_STACK_ALIGNMENT);
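/* For illustration only (the exact limits are target-dependent):

     static double a[256];   // TREE_STATIC: alignment may be raised,
                             // up to MAX_OFILE_ALIGNMENT
     void f (void)
     {
       double b[256];        // automatic: limited by MAX_STACK_ALIGNMENT
     }

   whereas a variable whose alignment the symbol table says cannot be
   increased (e.g. one defined in another translation unit) is always
   rejected.  */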
5888 /* Return whether the data reference DR is supported with respect to its
5889 alignment.
5890 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even if
5891 it is aligned, i.e., check if it is possible to vectorize it with different
5892 alignment. */
5894 enum dr_alignment_support
5895 vect_supportable_dr_alignment (struct data_reference *dr,
5896 bool check_aligned_accesses)
5898 gimple *stmt = DR_STMT (dr);
5899 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5900 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5901 machine_mode mode = TYPE_MODE (vectype);
5902 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5903 struct loop *vect_loop = NULL;
5904 bool nested_in_vect_loop = false;
5906 if (aligned_access_p (dr) && !check_aligned_accesses)
5907 return dr_aligned;
5909 /* For now assume all conditional loads/stores support unaligned
5910 access without any special code. */
5911 if (is_gimple_call (stmt)
5912 && gimple_call_internal_p (stmt)
5913 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5914 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5915 return dr_unaligned_supported;
5917 if (loop_vinfo)
5919 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5920 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5923 /* Possibly unaligned access. */
5925 /* We can choose between using the implicit realignment scheme (generating
5926 a misaligned_move stmt) and the explicit realignment scheme (generating
5927 aligned loads with a REALIGN_LOAD). There are two variants to the
5928 explicit realignment scheme: optimized, and unoptimized.
5929 We can optimize the realignment only if the step between consecutive
5930 vector loads is equal to the vector size. Since the vector memory
5931 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5932 is guaranteed that the misalignment amount remains the same throughout the
5933 execution of the vectorized loop. Therefore, we can create the
5934 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5935 at the loop preheader.
5937 However, in the case of outer-loop vectorization, when vectorizing a
5938 memory access in the inner-loop nested within the LOOP that is now being
5939 vectorized, while it is guaranteed that the misalignment of the
5940 vectorized memory access will remain the same in different outer-loop
5941 iterations, it is *not* guaranteed that it will remain the same throughout
5942 the execution of the inner-loop. This is because the inner-loop advances
5943 with the original scalar step (and not in steps of VS). If the inner-loop
5944 step happens to be a multiple of VS, then the misalignment remains fixed
5945 and we can use the optimized realignment scheme. For example:
5947 for (i=0; i<N; i++)
5948 for (j=0; j<M; j++)
5949 s += a[i+j];
5951 When vectorizing the i-loop in the above example, the step between
5952 consecutive vector loads is 1, and so the misalignment does not remain
5953 fixed across the execution of the inner-loop, and the realignment cannot
5954 be optimized (as illustrated in the following pseudo vectorized loop):
5956 for (i=0; i<N; i+=4)
5957 for (j=0; j<M; j++){
5958 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5959 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5960 // (assuming that we start from an aligned address).
5963 We therefore have to use the unoptimized realignment scheme:
5965 for (i=0; i<N; i+=4)
5966 for (j=k; j<M; j+=4)
5967 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5968 // that the misalignment of the initial address is
5969 // 0).
5971 The loop can then be vectorized as follows:
5973 for (k=0; k<4; k++){
5974 rt = get_realignment_token (&vp[k]);
5975 for (i=0; i<N; i+=4){
5976 v1 = vp[i+k];
5977 for (j=k; j<M; j+=4){
5978 v2 = vp[i+j+VS-1];
5979 va = REALIGN_LOAD <v1,v2,rt>;
5980 vs += va;
5981 v1 = v2;
5984 } */
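/* Informal summary of the overall decision in this function:

     access already aligned (and aligned accesses need no extra check)
       -> dr_aligned
     masked load/store (IFN_MASK_LOAD / IFN_MASK_STORE)
       -> dr_unaligned_supported
     read, target provides vec_realign_load (and a mask-for-load builtin
     if it defines one), and the SLP group layout does not rule it out
       -> dr_explicit_realign_optimized when the misalignment is invariant
          across the vectorized loop, dr_explicit_realign otherwise
     target supports misaligned vector accesses for this mode
       -> dr_unaligned_supported
     otherwise
       -> dr_unaligned_unsupported  */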
5986 if (DR_IS_READ (dr))
5988 bool is_packed = false;
5989 tree type = (TREE_TYPE (DR_REF (dr)));
5991 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5992 && (!targetm.vectorize.builtin_mask_for_load
5993 || targetm.vectorize.builtin_mask_for_load ()))
5995 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5997 /* If we are doing SLP then the accesses need not have the same
5998 alignment; the required alignment instead depends on the SLP group size. */
5999 if (loop_vinfo
6000 && STMT_SLP_TYPE (stmt_info)
6001 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6002 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
6003 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
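/* Nothing to do for this case: with such a group layout the misalignment
   can vary between vector accesses, so the realignment scheme cannot be
   used and we fall through to the plain misalignment check below.  */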
6005 else if (!loop_vinfo
6006 || (nested_in_vect_loop
6007 && (TREE_INT_CST_LOW (DR_STEP (dr))
6008 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
6009 return dr_explicit_realign;
6010 else
6011 return dr_explicit_realign_optimized;
6013 if (!known_alignment_for_access_p (dr))
6014 is_packed = not_size_aligned (DR_REF (dr));
6016 if (targetm.vectorize.support_vector_misalignment
6017 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6018 /* Can't software pipeline the loads, but can at least do them. */
6019 return dr_unaligned_supported;
6021 else
6023 bool is_packed = false;
6024 tree type = (TREE_TYPE (DR_REF (dr)));
6026 if (!known_alignment_for_access_p (dr))
6027 is_packed = not_size_aligned (DR_REF (dr));
6029 if (targetm.vectorize.support_vector_misalignment
6030 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6031 return dr_unaligned_supported;
6034 /* Unsupported. */
6035 return dr_unaligned_unsupported;