gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "params.h"
  53 #include "tree-cfg.h"
  54 #include "tree-hash-traits.h"
  55 #include "vec-perm-indices.h"
  56
  57 /* Return true if load- or store-lanes optab OPTAB is implemented for
  58    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  59
  60 static bool
  61 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  62                               tree vectype, unsigned HOST_WIDE_INT count)
  63 {
  64   machine_mode mode;
  65   scalar_int_mode array_mode;
  66   bool limit_p;
  67
  68   mode = TYPE_MODE (vectype);
  69   limit_p = !targetm.array_mode_supported_p (mode, count);
  70   if (!int_mode_for_size (count * GET_MODE_BITSIZE (mode),
  71                           limit_p).exists (&array_mode))
  72     {
  73       if (dump_enabled_p ())
  74         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  75                          "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
  76                          GET_MODE_NAME (mode), count);
  77       return false;
  78     }
  79
  80   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  81     {
  82       if (dump_enabled_p ())
  83         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  84                          "cannot use %s<%s><%s>\n", name,
  85                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  86       return false;
  87     }
  88
  89   if (dump_enabled_p ())
  90     dump_printf_loc (MSG_NOTE, vect_location,
  91                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  92                      GET_MODE_NAME (mode));
  93
  94   return true;
  95 }
  96
  97
  98 /* Return the smallest scalar part of STMT.
  99    This is used to determine the vectype of the stmt.  We generally set the
 100    vectype according to the type of the result (lhs).  For stmts whose
 101    result-type is different than the type of the arguments (e.g., demotion,
 102    promotion), vectype will be reset appropriately (later).  Note that we have
 103    to visit the smallest datatype in this function, because that determines the
 104    VF.  If the smallest datatype in the loop is present only as the rhs of a
 105    promotion operation - we'd miss it.
 106    Such a case, where a variable of this datatype does not appear in the lhs
 107    anywhere in the loop, can only occur if it's an invariant: e.g.:
 108    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 109    invariant motion.  However, we cannot rely on invariant motion to always
 110    take invariants out of the loop, and so in the case of promotion we also
 111    have to check the rhs.
 112    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 113    types.  */
 114
 115 tree
 116 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
 117                                HOST_WIDE_INT *rhs_size_unit)
 118 {
 119   tree scalar_type = gimple_expr_type (stmt);
 120   HOST_WIDE_INT lhs, rhs;
 121
 122   /* During the analysis phase, this function is called on arbitrary
 123      statements that might not have scalar results.  */
 124   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
 125     return scalar_type;
 126
 127   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 128
 129   if (is_gimple_assign (stmt)
 130       && (gimple_assign_cast_p (stmt)
 131           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
 132           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
 133           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
 134     {
 135       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 136
 137       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 138       if (rhs < lhs)
 139         scalar_type = rhs_type;
 140     }
 141
 142   *lhs_size_unit = lhs;
 143   *rhs_size_unit = rhs;
 144   return scalar_type;
 145 }
 146
 147
 148 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 149    tested at run-time.  Return TRUE if DDR was successfully inserted.
 150    Return false if versioning is not supported.  */
 151
 152 static bool
 153 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 154 {
 155   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 156
 157   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 158     return false;
 159
 160   if (!runtime_alias_check_p (ddr, loop,
 161                               optimize_loop_nest_for_speed_p (loop)))
 162     return false;
 163
 164   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 165   return true;
 166 }
 167
 168
 169 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 170    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 171    distances.  These distances are conservatively correct but they don't
 172    reflect a guaranteed dependence.
 173
 174    Return true if this function does all the work necessary to avoid
 175    an alias or false if the caller should use the dependence distances
 176    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 177    the depth of the loop described by LOOP_VINFO and the other arguments
 178    are as for vect_analyze_data_ref_dependence.  */
 179
 180 static bool
 181 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 182                                        loop_vec_info loop_vinfo,
 183                                        int loop_depth, unsigned int *max_vf)
 184 {
 185   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 186   lambda_vector dist_v;
 187   unsigned int i;
 188   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 189     {
 190       int dist = dist_v[loop_depth];
 191       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 192         {
 193           /* If the user asserted safelen >= DIST consecutive iterations
 194              can be executed concurrently, assume independence.
 195
 196              ??? An alternative would be to add the alias check even
 197              in this case, and vectorize the fallback loop with the
 198              maximum VF set to safelen.  However, if the user has
 199              explicitly given a length, it's less likely that that
 200              would be a win.  */
 201           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 202             {
 203               if ((unsigned int) loop->safelen < *max_vf)
 204                 *max_vf = loop->safelen;
 205               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 206               continue;
 207             }
 208
 209           /* For dependence distances of 2 or more, we have the option
 210              of limiting VF or checking for an alias at runtime.
 211              Prefer to check at runtime if we can, to avoid limiting
 212              the VF unnecessarily when the bases are in fact independent.
 213
 214              Note that the alias checks will be removed if the VF ends up
 215              being small enough.  */
 216           return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 217         }
 218     }
 219   return true;
 220 }
 221
 222
 223 /* Function vect_analyze_data_ref_dependence.
 224
 225    Return TRUE if there (might) exist a dependence between a memory-reference
 226    DRA and a memory-reference DRB.  When versioning for alias may check a
 227    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 228    the data dependence.  */
 229
 230 static bool
 231 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 232                                   loop_vec_info loop_vinfo,
 233                                   unsigned int *max_vf)
 234 {
 235   unsigned int i;
 236   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 237   struct data_reference *dra = DDR_A (ddr);
 238   struct data_reference *drb = DDR_B (ddr);
 239   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
 240   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
 241   lambda_vector dist_v;
 242   unsigned int loop_depth;
 243
 244   /* In loop analysis all data references should be vectorizable.  */
 245   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 246       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 247     gcc_unreachable ();
 248
 249   /* Independent data accesses.  */
 250   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 251     return false;
 252
 253   if (dra == drb
 254       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 255     return false;
 256
 257   /* We do not have to consider dependences between accesses that belong
 258      to the same group.  */
 259   if (GROUP_FIRST_ELEMENT (stmtinfo_a)
 260       && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
 261     return false;
 262
 263   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 264      least two scalar iterations, there is always also a true dependence.
 265      As the vectorizer does not re-order loads and stores we can ignore
 266      the anti-dependence if TBAA can disambiguate both DRs similar to the
 267      case with known negative distance anti-dependences (positive
 268      distance anti-dependences would violate TBAA constraints).  */
 269   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 270        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 271       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 272                                  get_alias_set (DR_REF (drb))))
 273     return false;
 274
 275   /* Unknown data dependence.  */
 276   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 277     {
 278       /* If user asserted safelen consecutive iterations can be
 279          executed concurrently, assume independence.  */
 280       if (loop->safelen >= 2)
 281         {
 282           if ((unsigned int) loop->safelen < *max_vf)
 283             *max_vf = loop->safelen;
 284           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 285           return false;
 286         }
 287
 288       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 289           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 290         {
 291           if (dump_enabled_p ())
 292             {
 293               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 294                                "versioning for alias not supported for: "
 295                                "can't determine dependence between ");
 296               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 297                                  DR_REF (dra));
 298               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 299               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 300                                  DR_REF (drb));
 301               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 302             }
 303           return true;
 304         }
 305
 306       if (dump_enabled_p ())
 307         {
 308           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 309                            "versioning for alias required: "
 310                            "can't determine dependence between ");
 311           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 312                              DR_REF (dra));
 313           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 314           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 315                              DR_REF (drb));
 316           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 317         }
 318
 319       /* Add to list of ddrs that need to be tested at run-time.  */
 320       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 321     }
 322
 323   /* Known data dependence.  */
 324   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 325     {
 326       /* If user asserted safelen consecutive iterations can be
 327          executed concurrently, assume independence.  */
 328       if (loop->safelen >= 2)
 329         {
 330           if ((unsigned int) loop->safelen < *max_vf)
 331             *max_vf = loop->safelen;
 332           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 333           return false;
 334         }
 335
 336       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 337           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 338         {
 339           if (dump_enabled_p ())
 340             {
 341               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 342                                "versioning for alias not supported for: "
 343                                "bad dist vector for ");
 344               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 345                                  DR_REF (dra));
 346               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 347               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 348                                  DR_REF (drb));
 349               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 350             }
 351           return true;
 352         }
 353
 354       if (dump_enabled_p ())
 355         {
 356           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 357                            "versioning for alias required: "
 358                            "bad dist vector for ");
 359           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 360           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 361           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 362           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 363         }
 364       /* Add to list of ddrs that need to be tested at run-time.  */
 365       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 366     }
 367
 368   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 369
 370   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
 371       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
 372                                                 loop_depth, max_vf))
 373     return false;
 374
 375   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 376     {
 377       int dist = dist_v[loop_depth];
 378
 379       if (dump_enabled_p ())
 380         dump_printf_loc (MSG_NOTE, vect_location,
 381                          "dependence distance  = %d.\n", dist);
 382
 383       if (dist == 0)
 384         {
 385           if (dump_enabled_p ())
 386             {
 387               dump_printf_loc (MSG_NOTE, vect_location,
 388                                "dependence distance == 0 between ");
 389               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 390               dump_printf (MSG_NOTE, " and ");
 391               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 392               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 393             }
 394
 395           /* When we perform grouped accesses and perform implicit CSE
 396              by detecting equal accesses and doing disambiguation with
 397              runtime alias tests like for
 398                 .. = a[i];
 399                 .. = a[i+1];
 400                 a[i] = ..;
 401                 a[i+1] = ..;
 402                 *p = ..;
 403                 .. = a[i];
 404                 .. = a[i+1];
 405              where we will end up loading { a[i], a[i+1] } once, make
 406              sure that inserting group loads before the first load and
 407              stores after the last store will do the right thing.
 408              Similar for groups like
 409                 a[i] = ...;
 410                 ... = a[i];
 411                 a[i+1] = ...;
 412              where loads from the group interleave with the store.  */
 413           if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 414               || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 415             {
 416               gimple *earlier_stmt;
 417               earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
 418               if (DR_IS_WRITE
 419                     (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
 420                 {
 421                   if (dump_enabled_p ())
 422                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 423                                      "READ_WRITE dependence in interleaving."
 424                                      "\n");
 425                   return true;
 426                 }
 427             }
 428
 429           continue;
 430         }
 431
 432       if (dist > 0 && DDR_REVERSED_P (ddr))
 433         {
 434           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 435              reversed (to make distance vector positive), and the actual
 436              distance is negative.  */
 437           if (dump_enabled_p ())
 438             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 439                              "dependence distance negative.\n");
 440           /* Record a negative dependence distance to later limit the
 441              amount of stmt copying / unrolling we can perform.
 442              Only need to handle read-after-write dependence.  */
 443           if (DR_IS_READ (drb)
 444               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 445                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 446             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 447           continue;
 448         }
 449
 450       unsigned int abs_dist = abs (dist);
 451       if (abs_dist >= 2 && abs_dist < *max_vf)
 452         {
 453           /* The dependence distance requires reduction of the maximal
 454              vectorization factor.  */
 455           *max_vf = abs (dist);
 456           if (dump_enabled_p ())
 457             dump_printf_loc (MSG_NOTE, vect_location,
 458                              "adjusting maximal vectorization factor to %i\n",
 459                              *max_vf);
 460         }
 461
 462       if (abs_dist >= *max_vf)
 463         {
 464           /* Dependence distance does not create dependence, as far as
 465              vectorization is concerned, in this case.  */
 466           if (dump_enabled_p ())
 467             dump_printf_loc (MSG_NOTE, vect_location,
 468                              "dependence distance >= VF.\n");
 469           continue;
 470         }
 471
 472       if (dump_enabled_p ())
 473         {
 474           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 475                        "not vectorized, possible dependence "
 476                        "between data-refs ");
 477           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 478           dump_printf (MSG_NOTE,  " and ");
 479           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 480           dump_printf (MSG_NOTE,  "\n");
 481         }
 482
 483       return true;
 484     }
 485
 486   return false;
 487 }
 488
 489 /* Function vect_analyze_data_ref_dependences.
 490
 491    Examine all the data references in the loop, and make sure there do not
 492    exist any data dependences between them.  Set *MAX_VF according to
 493    the maximum vectorization factor the data dependences allow.  */
 494
 495 bool
 496 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
 497                                    unsigned int *max_vf)
 498 {
 499   unsigned int i;
 500   struct data_dependence_relation *ddr;
 501
 502   if (dump_enabled_p ())
 503     dump_printf_loc (MSG_NOTE, vect_location,
 504                      "=== vect_analyze_data_ref_dependences ===\n");
 505
 506   LOOP_VINFO_DDRS (loop_vinfo)
 507     .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 508              * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 509   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 510   /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
 511   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 512                                 &LOOP_VINFO_DDRS (loop_vinfo),
 513                                 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
 514     return false;
 515
 516   /* For epilogues we either have no aliases or alias versioning
 517      was applied to original loop.  Therefore we may just get max_vf
 518      using VF of original loop.  */
 519   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 520     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 521   else
 522     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 523       if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
 524         return false;
 525
 526   return true;
 527 }
 528
 529
 530 /* Function vect_slp_analyze_data_ref_dependence.
 531
 532    Return TRUE if there (might) exist a dependence between a memory-reference
 533    DRA and a memory-reference DRB.  When versioning for alias may check a
 534    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 535    the data dependence.  */
 536
 537 static bool
 538 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
 539 {
 540   struct data_reference *dra = DDR_A (ddr);
 541   struct data_reference *drb = DDR_B (ddr);
 542
 543   /* We need to check dependences of statements marked as unvectorizable
 544      as well, they still can prohibit vectorization.  */
 545
 546   /* Independent data accesses.  */
 547   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 548     return false;
 549
 550   if (dra == drb)
 551     return false;
 552
 553   /* Read-read is OK.  */
 554   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 555     return false;
 556
 557   /* If dra and drb are part of the same interleaving chain consider
 558      them independent.  */
 559   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
 560       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
 561           == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
 562     return false;
 563
 564   /* Unknown data dependence.  */
 565   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 566     {
 567       if  (dump_enabled_p ())
 568         {
 569           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 570                            "can't determine dependence between ");
 571           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 572           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 573           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 574           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 575         }
 576     }
 577   else if (dump_enabled_p ())
 578     {
 579       dump_printf_loc (MSG_NOTE, vect_location,
 580                        "determined dependence between ");
 581       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 582       dump_printf (MSG_NOTE, " and ");
 583       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 584       dump_printf (MSG_NOTE,  "\n");
 585     }
 586
 587   return true;
 588 }
 589
 590
 591 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 592    contain the vector of scalar stores of this instance if we are
 593    disambiguating the loads.  */
 594
 595 static bool
 596 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
 597                                    vec<gimple *> stores, gimple *last_store)
 598 {
 599   /* This walks over all stmts involved in the SLP load/store done
 600      in NODE verifying we can sink them up to the last stmt in the
 601      group.  */
 602   gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
 603   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 604     {
 605       gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
 606       if (access == last_access)
 607         continue;
 608       data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
 609       for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
 610            gsi_stmt (gsi) != last_access; gsi_next (&gsi))
 611         {
 612           gimple *stmt = gsi_stmt (gsi);
 613           if (! gimple_vuse (stmt)
 614               || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
 615             continue;
 616
 617           /* If we couldn't record a (single) data reference for this
 618              stmt we have to give up.  */
 619           /* ???  Here and below if dependence analysis fails we can resort
 620              to the alias oracle which can handle more kinds of stmts.  */
 621           data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
 622           if (!dr_b)
 623             return false;
 624
 625           bool dependent = false;
 626           /* If we run into a store of this same instance (we've just
 627              marked those) then delay dependence checking until we run
 628              into the last store because this is where it will have
 629              been sunk to (and we verify if we can do that as well).  */
 630           if (gimple_visited_p (stmt))
 631             {
 632               if (stmt != last_store)
 633                 continue;
 634               unsigned i;
 635               gimple *store;
 636               FOR_EACH_VEC_ELT (stores, i, store)
 637                 {
 638                   data_reference *store_dr
 639                     = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
 640                   ddr_p ddr = initialize_data_dependence_relation
 641                                 (dr_a, store_dr, vNULL);
 642                   dependent = vect_slp_analyze_data_ref_dependence (ddr);
 643                   free_dependence_relation (ddr);
 644                   if (dependent)
 645                     break;
 646                 }
 647             }
 648           else
 649             {
 650               ddr_p ddr = initialize_data_dependence_relation (dr_a,
 651                                                                dr_b, vNULL);
 652               dependent = vect_slp_analyze_data_ref_dependence (ddr);
 653               free_dependence_relation (ddr);
 654             }
 655           if (dependent)
 656             return false;
 657         }
 658     }
 659   return true;
 660 }
 661
 662
 663 /* Function vect_analyze_data_ref_dependences.
 664
 665    Examine all the data references in the basic-block, and make sure there
 666    do not exist any data dependences between them.  Set *MAX_VF according to
 667    the maximum vectorization factor the data dependences allow.  */
 668
 669 bool
 670 vect_slp_analyze_instance_dependence (slp_instance instance)
 671 {
 672   if (dump_enabled_p ())
 673     dump_printf_loc (MSG_NOTE, vect_location,
 674                      "=== vect_slp_analyze_instance_dependence ===\n");
 675
 676   /* The stores of this instance are at the root of the SLP tree.  */
 677   slp_tree store = SLP_INSTANCE_TREE (instance);
 678   if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
 679     store = NULL;
 680
 681   /* Verify we can sink stores to the vectorized stmt insert location.  */
 682   gimple *last_store = NULL;
 683   if (store)
 684     {
 685       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
 686         return false;
 687
 688       /* Mark stores in this instance and remember the last one.  */
 689       last_store = vect_find_last_scalar_stmt_in_slp (store);
 690       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 691         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
 692     }
 693
 694   bool res = true;
 695
 696   /* Verify we can sink loads to the vectorized stmt insert location,
 697      special-casing stores of this instance.  */
 698   slp_tree load;
 699   unsigned int i;
 700   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
 701     if (! vect_slp_analyze_node_dependences (instance, load,
 702                                              store
 703                                              ? SLP_TREE_SCALAR_STMTS (store)
 704                                              : vNULL, last_store))
 705       {
 706         res = false;
 707         break;
 708       }
 709
 710   /* Unset the visited flag.  */
 711   if (store)
 712     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 713       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
 714
 715   return res;
 716 }
 717
 718 /* Record in VINFO the base alignment guarantee given by DRB.  STMT is
 719    the statement that contains DRB, which is useful for recording in the
 720    dump file.  */
 721
 722 static void
 723 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
 724                             innermost_loop_behavior *drb)
 725 {
 726   bool existed;
 727   innermost_loop_behavior *&entry
 728     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
 729   if (!existed || entry->base_alignment < drb->base_alignment)
 730     {
 731       entry = drb;
 732       if (dump_enabled_p ())
 733         {
 734           dump_printf_loc (MSG_NOTE, vect_location,
 735                            "recording new base alignment for ");
 736           dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
 737           dump_printf (MSG_NOTE, "\n");
 738           dump_printf_loc (MSG_NOTE, vect_location,
 739                            "  alignment:    %d\n", drb->base_alignment);
 740           dump_printf_loc (MSG_NOTE, vect_location,
 741                            "  misalignment: %d\n", drb->base_misalignment);
 742           dump_printf_loc (MSG_NOTE, vect_location,
 743                            "  based on:     ");
 744           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 745         }
 746     }
 747 }
 748
 749 /* If the region we're going to vectorize is reached, all unconditional
 750    data references occur at least once.  We can therefore pool the base
 751    alignment guarantees from each unconditional reference.  Do this by
 752    going through all the data references in VINFO and checking whether
 753    the containing statement makes the reference unconditionally.  If so,
 754    record the alignment of the base address in VINFO so that it can be
 755    used for all other references with the same base.  */
 756
 757 void
 758 vect_record_base_alignments (vec_info *vinfo)
 759 {
 760   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
 761   struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
 762   data_reference *dr;
 763   unsigned int i;
 764   FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
 765     if (!DR_IS_CONDITIONAL_IN_STMT (dr))
 766       {
 767         gimple *stmt = DR_STMT (dr);
 768         vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
 769
 770         /* If DR is nested in the loop that is being vectorized, we can also
 771            record the alignment of the base wrt the outer loop.  */
 772         if (loop && nested_in_vect_loop_p (loop, stmt))
 773           {
 774             stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 775             vect_record_base_alignment
 776               (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
 777           }
 778       }
 779 }
 780
 781 /* Return the target alignment for the vectorized form of DR.  */
 782
 783 static unsigned int
 784 vect_calculate_target_alignment (struct data_reference *dr)
 785 {
 786   gimple *stmt = DR_STMT (dr);
 787   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 788   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 789   return targetm.vectorize.preferred_vector_alignment (vectype);
 790 }
 791
 792 /* Function vect_compute_data_ref_alignment
 793
 794    Compute the misalignment of the data reference DR.
 795
 796    Output:
 797    1. If during the misalignment computation it is found that the data reference
 798       cannot be vectorized then false is returned.
 799    2. DR_MISALIGNMENT (DR) is defined.
 800
 801    FOR NOW: No analysis is actually performed. Misalignment is calculated
 802    only for trivial cases. TODO.  */
 803
 804 bool
 805 vect_compute_data_ref_alignment (struct data_reference *dr)
 806 {
 807   gimple *stmt = DR_STMT (dr);
 808   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 809   vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
 810   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 811   struct loop *loop = NULL;
 812   tree ref = DR_REF (dr);
 813   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 814
 815   if (dump_enabled_p ())
 816     dump_printf_loc (MSG_NOTE, vect_location,
 817                      "vect_compute_data_ref_alignment:\n");
 818
 819   if (loop_vinfo)
 820     loop = LOOP_VINFO_LOOP (loop_vinfo);
 821
 822   /* Initialize misalignment to unknown.  */
 823   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
 824
 825   innermost_loop_behavior *drb = vect_dr_behavior (dr);
 826   bool step_preserves_misalignment_p;
 827
 828   unsigned HOST_WIDE_INT vector_alignment
 829     = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
 830   DR_TARGET_ALIGNMENT (dr) = vector_alignment;
 831
 832   /* No step for BB vectorization.  */
 833   if (!loop)
 834     {
 835       gcc_assert (integer_zerop (drb->step));
 836       step_preserves_misalignment_p = true;
 837     }
 838
 839   /* In case the dataref is in an inner-loop of the loop that is being
 840      vectorized (LOOP), we use the base and misalignment information
 841      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 842      stays the same throughout the execution of the inner-loop, which is why
 843      we have to check that the stride of the dataref in the inner-loop evenly
 844      divides by the vector alignment.  */
 845   else if (nested_in_vect_loop_p (loop, stmt))
 846     {
 847       step_preserves_misalignment_p
 848         = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
 849
 850       if (dump_enabled_p ())
 851         {
 852           if (step_preserves_misalignment_p)
 853             dump_printf_loc (MSG_NOTE, vect_location,
 854                              "inner step divides the vector alignment.\n");
 855           else
 856             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 857                              "inner step doesn't divide the vector"
 858                              " alignment.\n");
 859         }
 860     }
 861
 862   /* Similarly we can only use base and misalignment information relative to
 863      an innermost loop if the misalignment stays the same throughout the
 864      execution of the loop.  As above, this is the case if the stride of
 865      the dataref evenly divides by the alignment.  */
 866   else
 867     {
 868       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 869       step_preserves_misalignment_p
 870         = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
 871
 872       if (!step_preserves_misalignment_p && dump_enabled_p ())
 873         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 874                          "step doesn't divide the vector alignment.\n");
 875     }
 876
 877   unsigned int base_alignment = drb->base_alignment;
 878   unsigned int base_misalignment = drb->base_misalignment;
 879
 880   /* Calculate the maximum of the pooled base address alignment and the
 881      alignment that we can compute for DR itself.  */
 882   innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
 883   if (entry && base_alignment < (*entry)->base_alignment)
 884     {
 885       base_alignment = (*entry)->base_alignment;
 886       base_misalignment = (*entry)->base_misalignment;
 887     }
 888
 889   if (drb->offset_alignment < vector_alignment
 890       || !step_preserves_misalignment_p
 891       /* We need to know whether the step wrt the vectorized loop is
 892          negative when computing the starting misalignment below.  */
 893       || TREE_CODE (drb->step) != INTEGER_CST)
 894     {
 895       if (dump_enabled_p ())
 896         {
 897           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 898                            "Unknown alignment for access: ");
 899           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 900           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 901         }
 902       return true;
 903     }
 904
 905   if (base_alignment < vector_alignment)
 906     {
 907       tree base = drb->base_address;
 908       if (TREE_CODE (base) == ADDR_EXPR)
 909         base = TREE_OPERAND (base, 0);
 910       if (!vect_can_force_dr_alignment_p (base,
 911                                           vector_alignment * BITS_PER_UNIT))
 912         {
 913           if (dump_enabled_p ())
 914             {
 915               dump_printf_loc (MSG_NOTE, vect_location,
 916                                "can't force alignment of ref: ");
 917               dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 918               dump_printf (MSG_NOTE, "\n");
 919             }
 920           return true;
 921         }
 922
 923       /* Force the alignment of the decl.
 924          NOTE: This is the only change to the code we make during
 925          the analysis phase, before deciding to vectorize the loop.  */
 926       if (dump_enabled_p ())
 927         {
 928           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
 929           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 930           dump_printf (MSG_NOTE, "\n");
 931         }
 932
 933       DR_VECT_AUX (dr)->base_decl = base;
 934       DR_VECT_AUX (dr)->base_misaligned = true;
 935       base_misalignment = 0;
 936     }
 937   poly_int64 misalignment
 938     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
 939
 940   /* If this is a backward running DR then first access in the larger
 941      vectype actually is N-1 elements before the address in the DR.
 942      Adjust misalign accordingly.  */
 943   if (tree_int_cst_sgn (drb->step) < 0)
 944     /* PLUS because STEP is negative.  */
 945     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
 946                      * TREE_INT_CST_LOW (drb->step));
 947
 948   unsigned int const_misalignment;
 949   if (!known_misalignment (misalignment, vector_alignment,
 950                            &const_misalignment))
 951     {
 952       if (dump_enabled_p ())
 953         {
 954           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 955                            "Non-constant misalignment for access: ");
 956           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 957           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 958         }
 959       return true;
 960     }
 961
 962   SET_DR_MISALIGNMENT (dr, const_misalignment);
 963
 964   if (dump_enabled_p ())
 965     {
 966       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 967                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
 968       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 969       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 970     }
 971
 972   return true;
 973 }
 974
 975 /* Function vect_update_misalignment_for_peel.
 976    Sets DR's misalignment
 977    - to 0 if it has the same alignment as DR_PEEL,
 978    - to the misalignment computed using NPEEL if DR's salignment is known,
 979    - to -1 (unknown) otherwise.
 980
 981    DR - the data reference whose misalignment is to be adjusted.
 982    DR_PEEL - the data reference whose misalignment is being made
 983              zero in the vector loop by the peel.
 984    NPEEL - the number of iterations in the peel loop if the misalignment
 985            of DR_PEEL is known at compile time.  */
 986
 987 static void
 988 vect_update_misalignment_for_peel (struct data_reference *dr,
 989                                    struct data_reference *dr_peel, int npeel)
 990 {
 991   unsigned int i;
 992   vec<dr_p> same_aligned_drs;
 993   struct data_reference *current_dr;
 994   int dr_size = vect_get_scalar_dr_size (dr);
 995   int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
 996   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
 997   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
 998
 999  /* For interleaved data accesses the step in the loop must be multiplied by
1000      the size of the interleaving group.  */
1001   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1002     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1003   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1004     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1005
1006   /* It can be assumed that the data refs with the same alignment as dr_peel
1007      are aligned in the vector loop.  */
1008   same_aligned_drs
1009     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1010   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1011     {
1012       if (current_dr != dr)
1013         continue;
1014       gcc_assert (!known_alignment_for_access_p (dr)
1015                   || !known_alignment_for_access_p (dr_peel)
1016                   || (DR_MISALIGNMENT (dr) / dr_size
1017                       == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1018       SET_DR_MISALIGNMENT (dr, 0);
1019       return;
1020     }
1021
1022   if (known_alignment_for_access_p (dr)
1023       && known_alignment_for_access_p (dr_peel))
1024     {
1025       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1026       int misal = DR_MISALIGNMENT (dr);
1027       misal += negative ? -npeel * dr_size : npeel * dr_size;
1028       misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1029       SET_DR_MISALIGNMENT (dr, misal);
1030       return;
1031     }
1032
1033   if (dump_enabled_p ())
1034     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1035                      "to unknown (-1).\n");
1036   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1037 }
1038
1039
1040 /* Function verify_data_ref_alignment
1041
1042    Return TRUE if DR can be handled with respect to alignment.  */
1043
1044 static bool
1045 verify_data_ref_alignment (data_reference_p dr)
1046 {
1047   enum dr_alignment_support supportable_dr_alignment
1048     = vect_supportable_dr_alignment (dr, false);
1049   if (!supportable_dr_alignment)
1050     {
1051       if (dump_enabled_p ())
1052         {
1053           if (DR_IS_READ (dr))
1054             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1055                              "not vectorized: unsupported unaligned load.");
1056           else
1057             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1058                              "not vectorized: unsupported unaligned "
1059                              "store.");
1060
1061           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1062                              DR_REF (dr));
1063           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1064         }
1065       return false;
1066     }
1067
1068   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1069     dump_printf_loc (MSG_NOTE, vect_location,
1070                      "Vectorizing an unaligned access.\n");
1071
1072   return true;
1073 }
1074
1075 /* Function vect_verify_datarefs_alignment
1076
1077    Return TRUE if all data references in the loop can be
1078    handled with respect to alignment.  */
1079
1080 bool
1081 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1082 {
1083   vec<data_reference_p> datarefs = vinfo->datarefs;
1084   struct data_reference *dr;
1085   unsigned int i;
1086
1087   FOR_EACH_VEC_ELT (datarefs, i, dr)
1088     {
1089       gimple *stmt = DR_STMT (dr);
1090       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1091
1092       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1093         continue;
1094
1095       /* For interleaving, only the alignment of the first access matters.   */
1096       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1097           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1098         continue;
1099
1100       /* Strided accesses perform only component accesses, alignment is
1101          irrelevant for them.  */
1102       if (STMT_VINFO_STRIDED_P (stmt_info)
1103           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1104         continue;
1105
1106       if (! verify_data_ref_alignment (dr))
1107         return false;
1108     }
1109
1110   return true;
1111 }
1112
1113 /* Given an memory reference EXP return whether its alignment is less
1114    than its size.  */
1115
1116 static bool
1117 not_size_aligned (tree exp)
1118 {
1119   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1120     return true;
1121
1122   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1123           > get_object_alignment (exp));
1124 }
1125
1126 /* Function vector_alignment_reachable_p
1127
1128    Return true if vector alignment for DR is reachable by peeling
1129    a few loop iterations.  Return false otherwise.  */
1130
1131 static bool
1132 vector_alignment_reachable_p (struct data_reference *dr)
1133 {
1134   gimple *stmt = DR_STMT (dr);
1135   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1136   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1137
1138   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1139     {
1140       /* For interleaved access we peel only if number of iterations in
1141          the prolog loop ({VF - misalignment}), is a multiple of the
1142          number of the interleaved accesses.  */
1143       int elem_size, mis_in_elements;
1144
1145       /* FORNOW: handle only known alignment.  */
1146       if (!known_alignment_for_access_p (dr))
1147         return false;
1148
1149       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1150       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1151       elem_size = vector_element_size (vector_size, nelements);
1152       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1153
1154       if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
1155         return false;
1156     }
1157
1158   /* If misalignment is known at the compile time then allow peeling
1159      only if natural alignment is reachable through peeling.  */
1160   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1161     {
1162       HOST_WIDE_INT elmsize =
1163                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1164       if (dump_enabled_p ())
1165         {
1166           dump_printf_loc (MSG_NOTE, vect_location,
1167                            "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1168           dump_printf (MSG_NOTE,
1169                        ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1170         }
1171       if (DR_MISALIGNMENT (dr) % elmsize)
1172         {
1173           if (dump_enabled_p ())
1174             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1175                              "data size does not divide the misalignment.\n");
1176           return false;
1177         }
1178     }
1179
1180   if (!known_alignment_for_access_p (dr))
1181     {
1182       tree type = TREE_TYPE (DR_REF (dr));
1183       bool is_packed = not_size_aligned (DR_REF (dr));
1184       if (dump_enabled_p ())
1185         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1186                          "Unknown misalignment, %snaturally aligned\n",
1187                          is_packed ? "not " : "");
1188       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1189     }
1190
1191   return true;
1192 }
1193
1194
1195 /* Calculate the cost of the memory access represented by DR.  */
1196
1197 static void
1198 vect_get_data_access_cost (struct data_reference *dr,
1199                            unsigned int *inside_cost,
1200                            unsigned int *outside_cost,
1201                            stmt_vector_for_cost *body_cost_vec)
1202 {
1203   gimple *stmt = DR_STMT (dr);
1204   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1205   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1206   int ncopies;
1207
1208   if (PURE_SLP_STMT (stmt_info))
1209     ncopies = 1;
1210   else
1211     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1212
1213   if (DR_IS_READ (dr))
1214     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1215                         NULL, body_cost_vec, false);
1216   else
1217     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1218
1219   if (dump_enabled_p ())
1220     dump_printf_loc (MSG_NOTE, vect_location,
1221                      "vect_get_data_access_cost: inside_cost = %d, "
1222                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1223 }
1224
1225
1226 typedef struct _vect_peel_info
1227 {
1228   struct data_reference *dr;
1229   int npeel;
1230   unsigned int count;
1231 } *vect_peel_info;
1232
1233 typedef struct _vect_peel_extended_info
1234 {
1235   struct _vect_peel_info peel_info;
1236   unsigned int inside_cost;
1237   unsigned int outside_cost;
1238 } *vect_peel_extended_info;
1239
1240
1241 /* Peeling hashtable helpers.  */
1242
1243 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1244 {
1245   static inline hashval_t hash (const _vect_peel_info *);
1246   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1247 };
1248
1249 inline hashval_t
1250 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1251 {
1252   return (hashval_t) peel_info->npeel;
1253 }
1254
1255 inline bool
1256 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1257 {
1258   return (a->npeel == b->npeel);
1259 }
1260
1261
1262 /* Insert DR into peeling hash table with NPEEL as key.  */
1263
1264 static void
1265 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1266                           loop_vec_info loop_vinfo, struct data_reference *dr,
1267                           int npeel)
1268 {
1269   struct _vect_peel_info elem, *slot;
1270   _vect_peel_info **new_slot;
1271   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1272
1273   elem.npeel = npeel;
1274   slot = peeling_htab->find (&elem);
1275   if (slot)
1276     slot->count++;
1277   else
1278     {
1279       slot = XNEW (struct _vect_peel_info);
1280       slot->npeel = npeel;
1281       slot->dr = dr;
1282       slot->count = 1;
1283       new_slot = peeling_htab->find_slot (slot, INSERT);
1284       *new_slot = slot;
1285     }
1286
1287   if (!supportable_dr_alignment
1288       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1289     slot->count += VECT_MAX_COST;
1290 }
1291
1292
1293 /* Traverse peeling hash table to find peeling option that aligns maximum
1294    number of data accesses.  */
1295
1296 int
1297 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1298                                      _vect_peel_extended_info *max)
1299 {
1300   vect_peel_info elem = *slot;
1301
1302   if (elem->count > max->peel_info.count
1303       || (elem->count == max->peel_info.count
1304           && max->peel_info.npeel > elem->npeel))
1305     {
1306       max->peel_info.npeel = elem->npeel;
1307       max->peel_info.count = elem->count;
1308       max->peel_info.dr = elem->dr;
1309     }
1310
1311   return 1;
1312 }
1313
1314 /* Get the costs of peeling NPEEL iterations checking data access costs
1315    for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1316    misalignment will be zero after peeling.  */
1317
1318 static void
1319 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1320                                 struct data_reference *dr0,
1321                                 unsigned int *inside_cost,
1322                                 unsigned int *outside_cost,
1323                                 stmt_vector_for_cost *body_cost_vec,
1324                                 unsigned int npeel,
1325                                 bool unknown_misalignment)
1326 {
1327   unsigned i;
1328   data_reference *dr;
1329
1330   FOR_EACH_VEC_ELT (datarefs, i, dr)
1331     {
1332       gimple *stmt = DR_STMT (dr);
1333       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1334       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1335         continue;
1336
1337       /* For interleaving, only the alignment of the first access
1338          matters.  */
1339       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1340           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1341         continue;
1342
1343       /* Strided accesses perform only component accesses, alignment is
1344          irrelevant for them.  */
1345       if (STMT_VINFO_STRIDED_P (stmt_info)
1346           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1347         continue;
1348
1349       int save_misalignment;
1350       save_misalignment = DR_MISALIGNMENT (dr);
1351       if (npeel == 0)
1352         ;
1353       else if (unknown_misalignment && dr == dr0)
1354         SET_DR_MISALIGNMENT (dr, 0);
1355       else
1356         vect_update_misalignment_for_peel (dr, dr0, npeel);
1357       vect_get_data_access_cost (dr, inside_cost, outside_cost,
1358                                  body_cost_vec);
1359       SET_DR_MISALIGNMENT (dr, save_misalignment);
1360     }
1361 }
1362
1363 /* Traverse peeling hash table and calculate cost for each peeling option.
1364    Find the one with the lowest cost.  */
1365
1366 int
1367 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1368                                    _vect_peel_extended_info *min)
1369 {
1370   vect_peel_info elem = *slot;
1371   int dummy;
1372   unsigned int inside_cost = 0, outside_cost = 0;
1373   gimple *stmt = DR_STMT (elem->dr);
1374   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1375   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1376   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1377                        epilogue_cost_vec;
1378
1379   prologue_cost_vec.create (2);
1380   body_cost_vec.create (2);
1381   epilogue_cost_vec.create (2);
1382
1383   vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1384                                   elem->dr, &inside_cost, &outside_cost,
1385                                   &body_cost_vec, elem->npeel, false);
1386
1387   body_cost_vec.release ();
1388
1389   outside_cost += vect_get_known_peeling_cost
1390     (loop_vinfo, elem->npeel, &dummy,
1391      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1392      &prologue_cost_vec, &epilogue_cost_vec);
1393
1394   /* Prologue and epilogue costs are added to the target model later.
1395      These costs depend only on the scalar iteration cost, the
1396      number of peeling iterations finally chosen, and the number of
1397      misaligned statements.  So discard the information found here.  */
1398   prologue_cost_vec.release ();
1399   epilogue_cost_vec.release ();
1400
1401   if (inside_cost < min->inside_cost
1402       || (inside_cost == min->inside_cost
1403           && outside_cost < min->outside_cost))
1404     {
1405       min->inside_cost = inside_cost;
1406       min->outside_cost = outside_cost;
1407       min->peel_info.dr = elem->dr;
1408       min->peel_info.npeel = elem->npeel;
1409       min->peel_info.count = elem->count;
1410     }
1411
1412   return 1;
1413 }
1414
1415
1416 /* Choose best peeling option by traversing peeling hash table and either
1417    choosing an option with the lowest cost (if cost model is enabled) or the
1418    option that aligns as many accesses as possible.  */
1419
1420 static struct _vect_peel_extended_info
1421 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1422                                        loop_vec_info loop_vinfo)
1423 {
1424    struct _vect_peel_extended_info res;
1425
1426    res.peel_info.dr = NULL;
1427
1428    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1429      {
1430        res.inside_cost = INT_MAX;
1431        res.outside_cost = INT_MAX;
1432        peeling_htab->traverse <_vect_peel_extended_info *,
1433                                vect_peeling_hash_get_lowest_cost> (&res);
1434      }
1435    else
1436      {
1437        res.peel_info.count = 0;
1438        peeling_htab->traverse <_vect_peel_extended_info *,
1439                                vect_peeling_hash_get_most_frequent> (&res);
1440        res.inside_cost = 0;
1441        res.outside_cost = 0;
1442      }
1443
1444    return res;
1445 }
1446
1447 /* Return true if the new peeling NPEEL is supported.  */
1448
1449 static bool
1450 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1451                           unsigned npeel)
1452 {
1453   unsigned i;
1454   struct data_reference *dr = NULL;
1455   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1456   gimple *stmt;
1457   stmt_vec_info stmt_info;
1458   enum dr_alignment_support supportable_dr_alignment;
1459
1460   /* Ensure that all data refs can be vectorized after the peel.  */
1461   FOR_EACH_VEC_ELT (datarefs, i, dr)
1462     {
1463       int save_misalignment;
1464
1465       if (dr == dr0)
1466         continue;
1467
1468       stmt = DR_STMT (dr);
1469       stmt_info = vinfo_for_stmt (stmt);
1470       /* For interleaving, only the alignment of the first access
1471          matters.  */
1472       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1473           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1474         continue;
1475
1476       /* Strided accesses perform only component accesses, alignment is
1477          irrelevant for them.  */
1478       if (STMT_VINFO_STRIDED_P (stmt_info)
1479           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1480         continue;
1481
1482       save_misalignment = DR_MISALIGNMENT (dr);
1483       vect_update_misalignment_for_peel (dr, dr0, npeel);
1484       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1485       SET_DR_MISALIGNMENT (dr, save_misalignment);
1486
1487       if (!supportable_dr_alignment)
1488         return false;
1489     }
1490
1491   return true;
1492 }
1493
1494 /* Function vect_enhance_data_refs_alignment
1495
1496    This pass will use loop versioning and loop peeling in order to enhance
1497    the alignment of data references in the loop.
1498
1499    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1500    original loop is to be vectorized.  Any other loops that are created by
1501    the transformations performed in this pass - are not supposed to be
1502    vectorized.  This restriction will be relaxed.
1503
1504    This pass will require a cost model to guide it whether to apply peeling
1505    or versioning or a combination of the two.  For example, the scheme that
1506    intel uses when given a loop with several memory accesses, is as follows:
1507    choose one memory access ('p') which alignment you want to force by doing
1508    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1509    other accesses are not necessarily aligned, or (2) use loop versioning to
1510    generate one loop in which all accesses are aligned, and another loop in
1511    which only 'p' is necessarily aligned.
1512
1513    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1514    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1515    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1516
1517    Devising a cost model is the most critical aspect of this work.  It will
1518    guide us on which access to peel for, whether to use loop versioning, how
1519    many versions to create, etc.  The cost model will probably consist of
1520    generic considerations as well as target specific considerations (on
1521    powerpc for example, misaligned stores are more painful than misaligned
1522    loads).
1523
1524    Here are the general steps involved in alignment enhancements:
1525
1526      -- original loop, before alignment analysis:
1527         for (i=0; i<N; i++){
1528           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1529           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1530         }
1531
1532      -- After vect_compute_data_refs_alignment:
1533         for (i=0; i<N; i++){
1534           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1535           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1536         }
1537
1538      -- Possibility 1: we do loop versioning:
1539      if (p is aligned) {
1540         for (i=0; i<N; i++){    # loop 1A
1541           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1542           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1543         }
1544      }
1545      else {
1546         for (i=0; i<N; i++){    # loop 1B
1547           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1548           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1549         }
1550      }
1551
1552      -- Possibility 2: we do loop peeling:
1553      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1554         x = q[i];
1555         p[i] = y;
1556      }
1557      for (i = 3; i < N; i++){   # loop 2A
1558         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1559         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1560      }
1561
1562      -- Possibility 3: combination of loop peeling and versioning:
1563      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1564         x = q[i];
1565         p[i] = y;
1566      }
1567      if (p is aligned) {
1568         for (i = 3; i<N; i++){  # loop 3A
1569           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1570           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1571         }
1572      }
1573      else {
1574         for (i = 3; i<N; i++){  # loop 3B
1575           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1576           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1577         }
1578      }
1579
1580      These loops are later passed to loop_transform to be vectorized.  The
1581      vectorizer will use the alignment information to guide the transformation
1582      (whether to generate regular loads/stores, or with special handling for
1583      misalignment).  */
1584
1585 bool
1586 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1587 {
1588   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1589   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1590   enum dr_alignment_support supportable_dr_alignment;
1591   struct data_reference *dr0 = NULL, *first_store = NULL;
1592   struct data_reference *dr;
1593   unsigned int i, j;
1594   bool do_peeling = false;
1595   bool do_versioning = false;
1596   bool stat;
1597   gimple *stmt;
1598   stmt_vec_info stmt_info;
1599   unsigned int npeel = 0;
1600   bool one_misalignment_known = false;
1601   bool one_misalignment_unknown = false;
1602   bool one_dr_unsupportable = false;
1603   struct data_reference *unsupportable_dr = NULL;
1604   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1605   unsigned possible_npeel_number = 1;
1606   tree vectype;
1607   unsigned int mis, same_align_drs_max = 0;
1608   hash_table<peel_info_hasher> peeling_htab (1);
1609
1610   if (dump_enabled_p ())
1611     dump_printf_loc (MSG_NOTE, vect_location,
1612                      "=== vect_enhance_data_refs_alignment ===\n");
1613
1614   /* Reset data so we can safely be called multiple times.  */
1615   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1616   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1617
1618   /* While cost model enhancements are expected in the future, the high level
1619      view of the code at this time is as follows:
1620
1621      A) If there is a misaligned access then see if peeling to align
1622         this access can make all data references satisfy
1623         vect_supportable_dr_alignment.  If so, update data structures
1624         as needed and return true.
1625
1626      B) If peeling wasn't possible and there is a data reference with an
1627         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1628         then see if loop versioning checks can be used to make all data
1629         references satisfy vect_supportable_dr_alignment.  If so, update
1630         data structures as needed and return true.
1631
1632      C) If neither peeling nor versioning were successful then return false if
1633         any data reference does not satisfy vect_supportable_dr_alignment.
1634
1635      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1636
1637      Note, Possibility 3 above (which is peeling and versioning together) is not
1638      being done at this time.  */
1639
1640   /* (1) Peeling to force alignment.  */
1641
1642   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1643      Considerations:
1644      + How many accesses will become aligned due to the peeling
1645      - How many accesses will become unaligned due to the peeling,
1646        and the cost of misaligned accesses.
1647      - The cost of peeling (the extra runtime checks, the increase
1648        in code size).  */
1649
1650   FOR_EACH_VEC_ELT (datarefs, i, dr)
1651     {
1652       stmt = DR_STMT (dr);
1653       stmt_info = vinfo_for_stmt (stmt);
1654
1655       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1656         continue;
1657
1658       /* For interleaving, only the alignment of the first access
1659          matters.  */
1660       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1661           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1662         continue;
1663
1664       /* For invariant accesses there is nothing to enhance.  */
1665       if (integer_zerop (DR_STEP (dr)))
1666         continue;
1667
1668       /* Strided accesses perform only component accesses, alignment is
1669          irrelevant for them.  */
1670       if (STMT_VINFO_STRIDED_P (stmt_info)
1671           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1672         continue;
1673
1674       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1675       do_peeling = vector_alignment_reachable_p (dr);
1676       if (do_peeling)
1677         {
1678           if (known_alignment_for_access_p (dr))
1679             {
1680               unsigned int npeel_tmp = 0;
1681               bool negative = tree_int_cst_compare (DR_STEP (dr),
1682                                                     size_zero_node) < 0;
1683
1684               vectype = STMT_VINFO_VECTYPE (stmt_info);
1685               unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1686               unsigned int dr_size = vect_get_scalar_dr_size (dr);
1687               mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1688               if (DR_MISALIGNMENT (dr) != 0)
1689                 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1690
1691               /* For multiple types, it is possible that the bigger type access
1692                  will have more than one peeling option.  E.g., a loop with two
1693                  types: one of size (vector size / 4), and the other one of
1694                  size (vector size / 8).  Vectorization factor will 8.  If both
1695                  accesses are misaligned by 3, the first one needs one scalar
1696                  iteration to be aligned, and the second one needs 5.  But the
1697                  first one will be aligned also by peeling 5 scalar
1698                  iterations, and in that case both accesses will be aligned.
1699                  Hence, except for the immediate peeling amount, we also want
1700                  to try to add full vector size, while we don't exceed
1701                  vectorization factor.
1702                  We do this automatically for cost model, since we calculate
1703                  cost for every peeling option.  */
1704               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1705                 {
1706                   poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1707                                           ? vf * GROUP_SIZE (stmt_info) : vf);
1708                   possible_npeel_number
1709                     = vect_get_num_vectors (nscalars, vectype);
1710
1711                   /* NPEEL_TMP is 0 when there is no misalignment, but also
1712                      allow peeling NELEMENTS.  */
1713                   if (DR_MISALIGNMENT (dr) == 0)
1714                     possible_npeel_number++;
1715                 }
1716
1717               /* Save info about DR in the hash table.  Also include peeling
1718                  amounts according to the explanation above.  */
1719               for (j = 0; j < possible_npeel_number; j++)
1720                 {
1721                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1722                                             dr, npeel_tmp);
1723                   npeel_tmp += target_align / dr_size;
1724                 }
1725
1726               one_misalignment_known = true;
1727             }
1728           else
1729             {
1730               /* If we don't know any misalignment values, we prefer
1731                  peeling for data-ref that has the maximum number of data-refs
1732                  with the same alignment, unless the target prefers to align
1733                  stores over load.  */
1734               unsigned same_align_drs
1735                 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1736               if (!dr0
1737                   || same_align_drs_max < same_align_drs)
1738                 {
1739                   same_align_drs_max = same_align_drs;
1740                   dr0 = dr;
1741                 }
1742               /* For data-refs with the same number of related
1743                  accesses prefer the one where the misalign
1744                  computation will be invariant in the outermost loop.  */
1745               else if (same_align_drs_max == same_align_drs)
1746                 {
1747                   struct loop *ivloop0, *ivloop;
1748                   ivloop0 = outermost_invariant_loop_for_expr
1749                     (loop, DR_BASE_ADDRESS (dr0));
1750                   ivloop = outermost_invariant_loop_for_expr
1751                     (loop, DR_BASE_ADDRESS (dr));
1752                   if ((ivloop && !ivloop0)
1753                       || (ivloop && ivloop0
1754                           && flow_loop_nested_p (ivloop, ivloop0)))
1755                     dr0 = dr;
1756                 }
1757
1758               one_misalignment_unknown = true;
1759
1760               /* Check for data refs with unsupportable alignment that
1761                  can be peeled.  */
1762               if (!supportable_dr_alignment)
1763               {
1764                 one_dr_unsupportable = true;
1765                 unsupportable_dr = dr;
1766               }
1767
1768               if (!first_store && DR_IS_WRITE (dr))
1769                 first_store = dr;
1770             }
1771         }
1772       else
1773         {
1774           if (!aligned_access_p (dr))
1775             {
1776               if (dump_enabled_p ())
1777                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1778                                  "vector alignment may not be reachable\n");
1779               break;
1780             }
1781         }
1782     }
1783
1784   /* Check if we can possibly peel the loop.  */
1785   if (!vect_can_advance_ivs_p (loop_vinfo)
1786       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1787       || loop->inner)
1788     do_peeling = false;
1789
1790   struct _vect_peel_extended_info peel_for_known_alignment;
1791   struct _vect_peel_extended_info peel_for_unknown_alignment;
1792   struct _vect_peel_extended_info best_peel;
1793
1794   peel_for_unknown_alignment.inside_cost = INT_MAX;
1795   peel_for_unknown_alignment.outside_cost = INT_MAX;
1796   peel_for_unknown_alignment.peel_info.count = 0;
1797
1798   if (do_peeling
1799       && one_misalignment_unknown)
1800     {
1801       /* Check if the target requires to prefer stores over loads, i.e., if
1802          misaligned stores are more expensive than misaligned loads (taking
1803          drs with same alignment into account).  */
1804       unsigned int load_inside_cost = 0;
1805       unsigned int load_outside_cost = 0;
1806       unsigned int store_inside_cost = 0;
1807       unsigned int store_outside_cost = 0;
1808       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1809
1810       stmt_vector_for_cost dummy;
1811       dummy.create (2);
1812       vect_get_peeling_costs_all_drs (datarefs, dr0,
1813                                       &load_inside_cost,
1814                                       &load_outside_cost,
1815                                       &dummy, estimated_npeels, true);
1816       dummy.release ();
1817
1818       if (first_store)
1819         {
1820           dummy.create (2);
1821           vect_get_peeling_costs_all_drs (datarefs, first_store,
1822                                           &store_inside_cost,
1823                                           &store_outside_cost,
1824                                           &dummy, estimated_npeels, true);
1825           dummy.release ();
1826         }
1827       else
1828         {
1829           store_inside_cost = INT_MAX;
1830           store_outside_cost = INT_MAX;
1831         }
1832
1833       if (load_inside_cost > store_inside_cost
1834           || (load_inside_cost == store_inside_cost
1835               && load_outside_cost > store_outside_cost))
1836         {
1837           dr0 = first_store;
1838           peel_for_unknown_alignment.inside_cost = store_inside_cost;
1839           peel_for_unknown_alignment.outside_cost = store_outside_cost;
1840         }
1841       else
1842         {
1843           peel_for_unknown_alignment.inside_cost = load_inside_cost;
1844           peel_for_unknown_alignment.outside_cost = load_outside_cost;
1845         }
1846
1847       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1848       prologue_cost_vec.create (2);
1849       epilogue_cost_vec.create (2);
1850
1851       int dummy2;
1852       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1853         (loop_vinfo, estimated_npeels, &dummy2,
1854          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1855          &prologue_cost_vec, &epilogue_cost_vec);
1856
1857       prologue_cost_vec.release ();
1858       epilogue_cost_vec.release ();
1859
1860       peel_for_unknown_alignment.peel_info.count = 1
1861         + STMT_VINFO_SAME_ALIGN_REFS
1862         (vinfo_for_stmt (DR_STMT (dr0))).length ();
1863     }
1864
1865   peel_for_unknown_alignment.peel_info.npeel = 0;
1866   peel_for_unknown_alignment.peel_info.dr = dr0;
1867
1868   best_peel = peel_for_unknown_alignment;
1869
1870   peel_for_known_alignment.inside_cost = INT_MAX;
1871   peel_for_known_alignment.outside_cost = INT_MAX;
1872   peel_for_known_alignment.peel_info.count = 0;
1873   peel_for_known_alignment.peel_info.dr = NULL;
1874
1875   if (do_peeling && one_misalignment_known)
1876     {
1877       /* Peeling is possible, but there is no data access that is not supported
1878          unless aligned.  So we try to choose the best possible peeling from
1879          the hash table.  */
1880       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1881         (&peeling_htab, loop_vinfo);
1882     }
1883
1884   /* Compare costs of peeling for known and unknown alignment. */
1885   if (peel_for_known_alignment.peel_info.dr != NULL
1886       && peel_for_unknown_alignment.inside_cost
1887       >= peel_for_known_alignment.inside_cost)
1888     {
1889       best_peel = peel_for_known_alignment;
1890
1891       /* If the best peeling for known alignment has NPEEL == 0, perform no
1892          peeling at all except if there is an unsupportable dr that we can
1893          align.  */
1894       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1895         do_peeling = false;
1896     }
1897
1898   /* If there is an unsupportable data ref, prefer this over all choices so far
1899      since we'd have to discard a chosen peeling except when it accidentally
1900      aligned the unsupportable data ref.  */
1901   if (one_dr_unsupportable)
1902     dr0 = unsupportable_dr;
1903   else if (do_peeling)
1904     {
1905       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1906          TODO: Use nopeel_outside_cost or get rid of it?  */
1907       unsigned nopeel_inside_cost = 0;
1908       unsigned nopeel_outside_cost = 0;
1909
1910       stmt_vector_for_cost dummy;
1911       dummy.create (2);
1912       vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1913                                       &nopeel_outside_cost, &dummy, 0, false);
1914       dummy.release ();
1915
1916       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
1917          costs will be recorded.  */
1918       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1919       prologue_cost_vec.create (2);
1920       epilogue_cost_vec.create (2);
1921
1922       int dummy2;
1923       nopeel_outside_cost += vect_get_known_peeling_cost
1924         (loop_vinfo, 0, &dummy2,
1925          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1926          &prologue_cost_vec, &epilogue_cost_vec);
1927
1928       prologue_cost_vec.release ();
1929       epilogue_cost_vec.release ();
1930
1931       npeel = best_peel.peel_info.npeel;
1932       dr0 = best_peel.peel_info.dr;
1933
1934       /* If no peeling is not more expensive than the best peeling we
1935          have so far, don't perform any peeling.  */
1936       if (nopeel_inside_cost <= best_peel.inside_cost)
1937         do_peeling = false;
1938     }
1939
1940   if (do_peeling)
1941     {
1942       stmt = DR_STMT (dr0);
1943       stmt_info = vinfo_for_stmt (stmt);
1944       vectype = STMT_VINFO_VECTYPE (stmt_info);
1945
1946       if (known_alignment_for_access_p (dr0))
1947         {
1948           bool negative = tree_int_cst_compare (DR_STEP (dr0),
1949                                                 size_zero_node) < 0;
1950           if (!npeel)
1951             {
1952               /* Since it's known at compile time, compute the number of
1953                  iterations in the peeled loop (the peeling factor) for use in
1954                  updating DR_MISALIGNMENT values.  The peeling factor is the
1955                  vectorization factor minus the misalignment as an element
1956                  count.  */
1957               mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
1958               unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
1959               npeel = ((mis & (target_align - 1))
1960                        / vect_get_scalar_dr_size (dr0));
1961             }
1962
1963           /* For interleaved data access every iteration accesses all the
1964              members of the group, therefore we divide the number of iterations
1965              by the group size.  */
1966           stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1967           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1968             npeel /= GROUP_SIZE (stmt_info);
1969
1970           if (dump_enabled_p ())
1971             dump_printf_loc (MSG_NOTE, vect_location,
1972                              "Try peeling by %d\n", npeel);
1973         }
1974
1975       /* Ensure that all datarefs can be vectorized after the peel.  */
1976       if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1977         do_peeling = false;
1978
1979       /* Check if all datarefs are supportable and log.  */
1980       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1981         {
1982           stat = vect_verify_datarefs_alignment (loop_vinfo);
1983           if (!stat)
1984             do_peeling = false;
1985           else
1986             return stat;
1987         }
1988
1989       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
1990       if (do_peeling)
1991         {
1992           unsigned max_allowed_peel
1993             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1994           if (max_allowed_peel != (unsigned)-1)
1995             {
1996               unsigned max_peel = npeel;
1997               if (max_peel == 0)
1998                 {
1999                   unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2000                   max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2001                 }
2002               if (max_peel > max_allowed_peel)
2003                 {
2004                   do_peeling = false;
2005                   if (dump_enabled_p ())
2006                     dump_printf_loc (MSG_NOTE, vect_location,
2007                         "Disable peeling, max peels reached: %d\n", max_peel);
2008                 }
2009             }
2010         }
2011
2012       /* Cost model #2 - if peeling may result in a remaining loop not
2013          iterating enough to be vectorized then do not peel.  Since this
2014          is a cost heuristic rather than a correctness decision, use the
2015          most likely runtime value for variable vectorization factors.  */
2016       if (do_peeling
2017           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2018         {
2019           unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2020           unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2021           if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2022               < assumed_vf + max_peel)
2023             do_peeling = false;
2024         }
2025
2026       if (do_peeling)
2027         {
2028           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2029              If the misalignment of DR_i is identical to that of dr0 then set
2030              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2031              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2032              by the peeling factor times the element size of DR_i (MOD the
2033              vectorization factor times the size).  Otherwise, the
2034              misalignment of DR_i must be set to unknown.  */
2035           FOR_EACH_VEC_ELT (datarefs, i, dr)
2036             if (dr != dr0)
2037               {
2038                 /* Strided accesses perform only component accesses, alignment
2039                    is irrelevant for them.  */
2040                 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2041                 if (STMT_VINFO_STRIDED_P (stmt_info)
2042                     && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2043                   continue;
2044
2045                 vect_update_misalignment_for_peel (dr, dr0, npeel);
2046               }
2047
2048           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2049           if (npeel)
2050             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2051           else
2052             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2053               = DR_MISALIGNMENT (dr0);
2054           SET_DR_MISALIGNMENT (dr0, 0);
2055           if (dump_enabled_p ())
2056             {
2057               dump_printf_loc (MSG_NOTE, vect_location,
2058                                "Alignment of access forced using peeling.\n");
2059               dump_printf_loc (MSG_NOTE, vect_location,
2060                                "Peeling for alignment will be applied.\n");
2061             }
2062
2063           /* The inside-loop cost will be accounted for in vectorizable_load
2064              and vectorizable_store correctly with adjusted alignments.
2065              Drop the body_cst_vec on the floor here.  */
2066           stat = vect_verify_datarefs_alignment (loop_vinfo);
2067           gcc_assert (stat);
2068           return stat;
2069         }
2070     }
2071
2072   /* (2) Versioning to force alignment.  */
2073
2074   /* Try versioning if:
2075      1) optimize loop for speed
2076      2) there is at least one unsupported misaligned data ref with an unknown
2077         misalignment, and
2078      3) all misaligned data refs with a known misalignment are supported, and
2079      4) the number of runtime alignment checks is within reason.  */
2080
2081   do_versioning =
2082         optimize_loop_nest_for_speed_p (loop)
2083         && (!loop->inner); /* FORNOW */
2084
2085   if (do_versioning)
2086     {
2087       FOR_EACH_VEC_ELT (datarefs, i, dr)
2088         {
2089           stmt = DR_STMT (dr);
2090           stmt_info = vinfo_for_stmt (stmt);
2091
2092           /* For interleaving, only the alignment of the first access
2093              matters.  */
2094           if (aligned_access_p (dr)
2095               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2096                   && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2097             continue;
2098
2099           if (STMT_VINFO_STRIDED_P (stmt_info))
2100             {
2101               /* Strided loads perform only component accesses, alignment is
2102                  irrelevant for them.  */
2103               if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2104                 continue;
2105               do_versioning = false;
2106               break;
2107             }
2108
2109           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2110
2111           if (!supportable_dr_alignment)
2112             {
2113               gimple *stmt;
2114               int mask;
2115               tree vectype;
2116
2117               if (known_alignment_for_access_p (dr)
2118                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2119                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2120                 {
2121                   do_versioning = false;
2122                   break;
2123                 }
2124
2125               stmt = DR_STMT (dr);
2126               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2127               gcc_assert (vectype);
2128
2129               /* At present we don't support versioning for alignment
2130                  with variable VF, since there's no guarantee that the
2131                  VF is a power of two.  We could relax this if we added
2132                  a way of enforcing a power-of-two size.  */
2133               unsigned HOST_WIDE_INT size;
2134               if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2135                 {
2136                   do_versioning = false;
2137                   break;
2138                 }
2139
2140               /* The rightmost bits of an aligned address must be zeros.
2141                  Construct the mask needed for this test.  For example,
2142                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2143                  mask must be 15 = 0xf. */
2144               mask = size - 1;
2145
2146               /* FORNOW: use the same mask to test all potentially unaligned
2147                  references in the loop.  The vectorizer currently supports
2148                  a single vector size, see the reference to
2149                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2150                  vectorization factor is computed.  */
2151               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2152                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2153               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2154               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2155                       DR_STMT (dr));
2156             }
2157         }
2158
2159       /* Versioning requires at least one misaligned data reference.  */
2160       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2161         do_versioning = false;
2162       else if (!do_versioning)
2163         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2164     }
2165
2166   if (do_versioning)
2167     {
2168       vec<gimple *> may_misalign_stmts
2169         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2170       gimple *stmt;
2171
2172       /* It can now be assumed that the data references in the statements
2173          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2174          of the loop being vectorized.  */
2175       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2176         {
2177           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2178           dr = STMT_VINFO_DATA_REF (stmt_info);
2179           SET_DR_MISALIGNMENT (dr, 0);
2180           if (dump_enabled_p ())
2181             dump_printf_loc (MSG_NOTE, vect_location,
2182                              "Alignment of access forced using versioning.\n");
2183         }
2184
2185       if (dump_enabled_p ())
2186         dump_printf_loc (MSG_NOTE, vect_location,
2187                          "Versioning for alignment will be applied.\n");
2188
2189       /* Peeling and versioning can't be done together at this time.  */
2190       gcc_assert (! (do_peeling && do_versioning));
2191
2192       stat = vect_verify_datarefs_alignment (loop_vinfo);
2193       gcc_assert (stat);
2194       return stat;
2195     }
2196
2197   /* This point is reached if neither peeling nor versioning is being done.  */
2198   gcc_assert (! (do_peeling || do_versioning));
2199
2200   stat = vect_verify_datarefs_alignment (loop_vinfo);
2201   return stat;
2202 }
2203
2204
2205 /* Function vect_find_same_alignment_drs.
2206
2207    Update group and alignment relations according to the chosen
2208    vectorization factor.  */
2209
2210 static void
2211 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2212 {
2213   struct data_reference *dra = DDR_A (ddr);
2214   struct data_reference *drb = DDR_B (ddr);
2215   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2216   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2217
2218   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2219     return;
2220
2221   if (dra == drb)
2222     return;
2223
2224   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2225       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2226       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2227     return;
2228
2229   /* Two references with distance zero have the same alignment.  */
2230   poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2231                           - wi::to_poly_offset (DR_INIT (drb)));
2232   if (maybe_ne (diff, 0))
2233     {
2234       /* Get the wider of the two alignments.  */
2235       unsigned int align_a = (vect_calculate_target_alignment (dra)
2236                               / BITS_PER_UNIT);
2237       unsigned int align_b = (vect_calculate_target_alignment (drb)
2238                               / BITS_PER_UNIT);
2239       unsigned int max_align = MAX (align_a, align_b);
2240
2241       /* Require the gap to be a multiple of the larger vector alignment.  */
2242       if (!multiple_p (diff, max_align))
2243         return;
2244     }
2245
2246   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2247   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2248   if (dump_enabled_p ())
2249     {
2250       dump_printf_loc (MSG_NOTE, vect_location,
2251                        "accesses have the same alignment: ");
2252       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2253       dump_printf (MSG_NOTE,  " and ");
2254       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2255       dump_printf (MSG_NOTE, "\n");
2256     }
2257 }
2258
2259
2260 /* Function vect_analyze_data_refs_alignment
2261
2262    Analyze the alignment of the data-references in the loop.
2263    Return FALSE if a data reference is found that cannot be vectorized.  */
2264
2265 bool
2266 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2267 {
2268   if (dump_enabled_p ())
2269     dump_printf_loc (MSG_NOTE, vect_location,
2270                      "=== vect_analyze_data_refs_alignment ===\n");
2271
2272   /* Mark groups of data references with same alignment using
2273      data dependence information.  */
2274   vec<ddr_p> ddrs = vinfo->ddrs;
2275   struct data_dependence_relation *ddr;
2276   unsigned int i;
2277
2278   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2279     vect_find_same_alignment_drs (ddr);
2280
2281   vec<data_reference_p> datarefs = vinfo->datarefs;
2282   struct data_reference *dr;
2283
2284   vect_record_base_alignments (vinfo);
2285   FOR_EACH_VEC_ELT (datarefs, i, dr)
2286     {
2287       stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2288       if (STMT_VINFO_VECTORIZABLE (stmt_info)
2289           && !vect_compute_data_ref_alignment (dr))
2290         {
2291           /* Strided accesses perform only component accesses, misalignment
2292              information is irrelevant for them.  */
2293           if (STMT_VINFO_STRIDED_P (stmt_info)
2294               && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2295             continue;
2296
2297           if (dump_enabled_p ())
2298             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2299                              "not vectorized: can't calculate alignment "
2300                              "for data ref.\n");
2301
2302           return false;
2303         }
2304     }
2305
2306   return true;
2307 }
2308
2309
2310 /* Analyze alignment of DRs of stmts in NODE.  */
2311
2312 static bool
2313 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2314 {
2315   /* We vectorize from the first scalar stmt in the node unless
2316      the node is permuted in which case we start from the first
2317      element in the group.  */
2318   gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2319   data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2320   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2321     first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2322
2323   data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2324   if (! vect_compute_data_ref_alignment (dr)
2325       /* For creating the data-ref pointer we need alignment of the
2326          first element anyway.  */
2327       || (dr != first_dr
2328           && ! vect_compute_data_ref_alignment (first_dr))
2329       || ! verify_data_ref_alignment (dr))
2330     {
2331       if (dump_enabled_p ())
2332         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2333                          "not vectorized: bad data alignment in basic "
2334                          "block.\n");
2335       return false;
2336     }
2337
2338   return true;
2339 }
2340
2341 /* Function vect_slp_analyze_instance_alignment
2342
2343    Analyze the alignment of the data-references in the SLP instance.
2344    Return FALSE if a data reference is found that cannot be vectorized.  */
2345
2346 bool
2347 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2348 {
2349   if (dump_enabled_p ())
2350     dump_printf_loc (MSG_NOTE, vect_location,
2351                      "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2352
2353   slp_tree node;
2354   unsigned i;
2355   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2356     if (! vect_slp_analyze_and_verify_node_alignment (node))
2357       return false;
2358
2359   node = SLP_INSTANCE_TREE (instance);
2360   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2361       && ! vect_slp_analyze_and_verify_node_alignment
2362              (SLP_INSTANCE_TREE (instance)))
2363     return false;
2364
2365   return true;
2366 }
2367
2368
2369 /* Analyze groups of accesses: check that DR belongs to a group of
2370    accesses of legal size, step, etc.  Detect gaps, single element
2371    interleaving, and other special cases. Set grouped access info.
2372    Collect groups of strided stores for further use in SLP analysis.
2373    Worker for vect_analyze_group_access.  */
2374
2375 static bool
2376 vect_analyze_group_access_1 (struct data_reference *dr)
2377 {
2378   tree step = DR_STEP (dr);
2379   tree scalar_type = TREE_TYPE (DR_REF (dr));
2380   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2381   gimple *stmt = DR_STMT (dr);
2382   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2383   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2384   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2385   HOST_WIDE_INT dr_step = -1;
2386   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2387   bool slp_impossible = false;
2388
2389   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2390      size of the interleaving group (including gaps).  */
2391   if (tree_fits_shwi_p (step))
2392     {
2393       dr_step = tree_to_shwi (step);
2394       /* Check that STEP is a multiple of type size.  Otherwise there is
2395          a non-element-sized gap at the end of the group which we
2396          cannot represent in GROUP_GAP or GROUP_SIZE.
2397          ???  As we can handle non-constant step fine here we should
2398          simply remove uses of GROUP_GAP between the last and first
2399          element and instead rely on DR_STEP.  GROUP_SIZE then would
2400          simply not include that gap.  */
2401       if ((dr_step % type_size) != 0)
2402         {
2403           if (dump_enabled_p ())
2404             {
2405               dump_printf_loc (MSG_NOTE, vect_location,
2406                                "Step ");
2407               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2408               dump_printf (MSG_NOTE,
2409                            " is not a multiple of the element size for ");
2410               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2411               dump_printf (MSG_NOTE, "\n");
2412             }
2413           return false;
2414         }
2415       groupsize = absu_hwi (dr_step) / type_size;
2416     }
2417   else
2418     groupsize = 0;
2419
2420   /* Not consecutive access is possible only if it is a part of interleaving.  */
2421   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2422     {
2423       /* Check if it this DR is a part of interleaving, and is a single
2424          element of the group that is accessed in the loop.  */
2425
2426       /* Gaps are supported only for loads. STEP must be a multiple of the type
2427          size.  The size of the group must be a power of 2.  */
2428       if (DR_IS_READ (dr)
2429           && (dr_step % type_size) == 0
2430           && groupsize > 0
2431           && pow2p_hwi (groupsize))
2432         {
2433           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2434           GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2435           GROUP_GAP (stmt_info) = groupsize - 1;
2436           if (dump_enabled_p ())
2437             {
2438               dump_printf_loc (MSG_NOTE, vect_location,
2439                                "Detected single element interleaving ");
2440               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2441               dump_printf (MSG_NOTE, " step ");
2442               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2443               dump_printf (MSG_NOTE, "\n");
2444             }
2445
2446           return true;
2447         }
2448
2449       if (dump_enabled_p ())
2450         {
2451           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2452                            "not consecutive access ");
2453           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2454         }
2455
2456       if (bb_vinfo)
2457         {
2458           /* Mark the statement as unvectorizable.  */
2459           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2460           return true;
2461         }
2462
2463       dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2464       STMT_VINFO_STRIDED_P (stmt_info) = true;
2465       return true;
2466     }
2467
2468   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2469     {
2470       /* First stmt in the interleaving chain. Check the chain.  */
2471       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2472       struct data_reference *data_ref = dr;
2473       unsigned int count = 1;
2474       tree prev_init = DR_INIT (data_ref);
2475       gimple *prev = stmt;
2476       HOST_WIDE_INT diff, gaps = 0;
2477
2478       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2479       while (next)
2480         {
2481           /* Skip same data-refs.  In case that two or more stmts share
2482              data-ref (supported only for loads), we vectorize only the first
2483              stmt, and the rest get their vectorized loads from the first
2484              one.  */
2485           if (!tree_int_cst_compare (DR_INIT (data_ref),
2486                                      DR_INIT (STMT_VINFO_DATA_REF (
2487                                                    vinfo_for_stmt (next)))))
2488             {
2489               if (DR_IS_WRITE (data_ref))
2490                 {
2491                   if (dump_enabled_p ())
2492                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2493                                      "Two store stmts share the same dr.\n");
2494                   return false;
2495                 }
2496
2497               if (dump_enabled_p ())
2498                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2499                                  "Two or more load stmts share the same dr.\n");
2500
2501               /* For load use the same data-ref load.  */
2502               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2503
2504               prev = next;
2505               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2506               continue;
2507             }
2508
2509           prev = next;
2510           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2511
2512           /* All group members have the same STEP by construction.  */
2513           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2514
2515           /* Check that the distance between two accesses is equal to the type
2516              size. Otherwise, we have gaps.  */
2517           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2518                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2519           if (diff != 1)
2520             {
2521               /* FORNOW: SLP of accesses with gaps is not supported.  */
2522               slp_impossible = true;
2523               if (DR_IS_WRITE (data_ref))
2524                 {
2525                   if (dump_enabled_p ())
2526                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2527                                      "interleaved store with gaps\n");
2528                   return false;
2529                 }
2530
2531               gaps += diff - 1;
2532             }
2533
2534           last_accessed_element += diff;
2535
2536           /* Store the gap from the previous member of the group. If there is no
2537              gap in the access, GROUP_GAP is always 1.  */
2538           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2539
2540           prev_init = DR_INIT (data_ref);
2541           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2542           /* Count the number of data-refs in the chain.  */
2543           count++;
2544         }
2545
2546       if (groupsize == 0)
2547         groupsize = count + gaps;
2548
2549       /* This could be UINT_MAX but as we are generating code in a very
2550          inefficient way we have to cap earlier.  See PR78699 for example.  */
2551       if (groupsize > 4096)
2552         {
2553           if (dump_enabled_p ())
2554             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2555                              "group is too large\n");
2556           return false;
2557         }
2558
2559       /* Check that the size of the interleaving is equal to count for stores,
2560          i.e., that there are no gaps.  */
2561       if (groupsize != count
2562           && !DR_IS_READ (dr))
2563         {
2564           if (dump_enabled_p ())
2565             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2566                              "interleaved store with gaps\n");
2567           return false;
2568         }
2569
2570       /* If there is a gap after the last load in the group it is the
2571          difference between the groupsize and the last accessed
2572          element.
2573          When there is no gap, this difference should be 0.  */
2574       GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2575
2576       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2577       if (dump_enabled_p ())
2578         {
2579           dump_printf_loc (MSG_NOTE, vect_location,
2580                            "Detected interleaving ");
2581           if (DR_IS_READ (dr))
2582             dump_printf (MSG_NOTE, "load ");
2583           else
2584             dump_printf (MSG_NOTE, "store ");
2585           dump_printf (MSG_NOTE, "of size %u starting with ",
2586                        (unsigned)groupsize);
2587           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2588           if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2589             dump_printf_loc (MSG_NOTE, vect_location,
2590                              "There is a gap of %u elements after the group\n",
2591                              GROUP_GAP (vinfo_for_stmt (stmt)));
2592         }
2593
2594       /* SLP: create an SLP data structure for every interleaving group of
2595          stores for further analysis in vect_analyse_slp.  */
2596       if (DR_IS_WRITE (dr) && !slp_impossible)
2597         {
2598           if (loop_vinfo)
2599             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2600           if (bb_vinfo)
2601             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2602         }
2603     }
2604
2605   return true;
2606 }
2607
2608 /* Analyze groups of accesses: check that DR belongs to a group of
2609    accesses of legal size, step, etc.  Detect gaps, single element
2610    interleaving, and other special cases. Set grouped access info.
2611    Collect groups of strided stores for further use in SLP analysis.  */
2612
2613 static bool
2614 vect_analyze_group_access (struct data_reference *dr)
2615 {
2616   if (!vect_analyze_group_access_1 (dr))
2617     {
2618       /* Dissolve the group if present.  */
2619       gimple *next;
2620       gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2621       while (stmt)
2622         {
2623           stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2624           next = GROUP_NEXT_ELEMENT (vinfo);
2625           GROUP_FIRST_ELEMENT (vinfo) = NULL;
2626           GROUP_NEXT_ELEMENT (vinfo) = NULL;
2627           stmt = next;
2628         }
2629       return false;
2630     }
2631   return true;
2632 }
2633
2634 /* Analyze the access pattern of the data-reference DR.
2635    In case of non-consecutive accesses call vect_analyze_group_access() to
2636    analyze groups of accesses.  */
2637
2638 static bool
2639 vect_analyze_data_ref_access (struct data_reference *dr)
2640 {
2641   tree step = DR_STEP (dr);
2642   tree scalar_type = TREE_TYPE (DR_REF (dr));
2643   gimple *stmt = DR_STMT (dr);
2644   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2645   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2646   struct loop *loop = NULL;
2647
2648   if (loop_vinfo)
2649     loop = LOOP_VINFO_LOOP (loop_vinfo);
2650
2651   if (loop_vinfo && !step)
2652     {
2653       if (dump_enabled_p ())
2654         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2655                          "bad data-ref access in loop\n");
2656       return false;
2657     }
2658
2659   /* Allow loads with zero step in inner-loop vectorization.  */
2660   if (loop_vinfo && integer_zerop (step))
2661     {
2662       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2663       if (!nested_in_vect_loop_p (loop, stmt))
2664         return DR_IS_READ (dr);
2665       /* Allow references with zero step for outer loops marked
2666          with pragma omp simd only - it guarantees absence of
2667          loop-carried dependencies between inner loop iterations.  */
2668       if (!loop->force_vectorize)
2669         {
2670           if (dump_enabled_p ())
2671             dump_printf_loc (MSG_NOTE, vect_location,
2672                              "zero step in inner loop of nest\n");
2673           return false;
2674         }
2675     }
2676
2677   if (loop && nested_in_vect_loop_p (loop, stmt))
2678     {
2679       /* Interleaved accesses are not yet supported within outer-loop
2680         vectorization for references in the inner-loop.  */
2681       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2682
2683       /* For the rest of the analysis we use the outer-loop step.  */
2684       step = STMT_VINFO_DR_STEP (stmt_info);
2685       if (integer_zerop (step))
2686         {
2687           if (dump_enabled_p ())
2688             dump_printf_loc (MSG_NOTE, vect_location,
2689                              "zero step in outer loop.\n");
2690           return DR_IS_READ (dr);
2691         }
2692     }
2693
2694   /* Consecutive?  */
2695   if (TREE_CODE (step) == INTEGER_CST)
2696     {
2697       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2698       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2699           || (dr_step < 0
2700               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2701         {
2702           /* Mark that it is not interleaving.  */
2703           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2704           return true;
2705         }
2706     }
2707
2708   if (loop && nested_in_vect_loop_p (loop, stmt))
2709     {
2710       if (dump_enabled_p ())
2711         dump_printf_loc (MSG_NOTE, vect_location,
2712                          "grouped access in outer loop.\n");
2713       return false;
2714     }
2715
2716
2717   /* Assume this is a DR handled by non-constant strided load case.  */
2718   if (TREE_CODE (step) != INTEGER_CST)
2719     return (STMT_VINFO_STRIDED_P (stmt_info)
2720             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2721                 || vect_analyze_group_access (dr)));
2722
2723   /* Not consecutive access - check if it's a part of interleaving group.  */
2724   return vect_analyze_group_access (dr);
2725 }
2726
2727 /* Compare two data-references DRA and DRB to group them into chunks
2728    suitable for grouping.  */
2729
2730 static int
2731 dr_group_sort_cmp (const void *dra_, const void *drb_)
2732 {
2733   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2734   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2735   int cmp;
2736
2737   /* Stabilize sort.  */
2738   if (dra == drb)
2739     return 0;
2740
2741   /* DRs in different loops never belong to the same group.  */
2742   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2743   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2744   if (loopa != loopb)
2745     return loopa->num < loopb->num ? -1 : 1;
2746
2747   /* Ordering of DRs according to base.  */
2748   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2749                                DR_BASE_ADDRESS (drb));
2750   if (cmp != 0)
2751     return cmp;
2752
2753   /* And according to DR_OFFSET.  */
2754   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2755   if (cmp != 0)
2756     return cmp;
2757
2758   /* Put reads before writes.  */
2759   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2760     return DR_IS_READ (dra) ? -1 : 1;
2761
2762   /* Then sort after access size.  */
2763   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2764                                TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2765   if (cmp != 0)
2766     return cmp;
2767
2768   /* And after step.  */
2769   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2770   if (cmp != 0)
2771     return cmp;
2772
2773   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2774   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2775   if (cmp == 0)
2776     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2777   return cmp;
2778 }
2779
2780 /* Function vect_analyze_data_ref_accesses.
2781
2782    Analyze the access pattern of all the data references in the loop.
2783
2784    FORNOW: the only access pattern that is considered vectorizable is a
2785            simple step 1 (consecutive) access.
2786
2787    FORNOW: handle only arrays and pointer accesses.  */
2788
2789 bool
2790 vect_analyze_data_ref_accesses (vec_info *vinfo)
2791 {
2792   unsigned int i;
2793   vec<data_reference_p> datarefs = vinfo->datarefs;
2794   struct data_reference *dr;
2795
2796   if (dump_enabled_p ())
2797     dump_printf_loc (MSG_NOTE, vect_location,
2798                      "=== vect_analyze_data_ref_accesses ===\n");
2799
2800   if (datarefs.is_empty ())
2801     return true;
2802
2803   /* Sort the array of datarefs to make building the interleaving chains
2804      linear.  Don't modify the original vector's order, it is needed for
2805      determining what dependencies are reversed.  */
2806   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2807   datarefs_copy.qsort (dr_group_sort_cmp);
2808
2809   /* Build the interleaving chains.  */
2810   for (i = 0; i < datarefs_copy.length () - 1;)
2811     {
2812       data_reference_p dra = datarefs_copy[i];
2813       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2814       stmt_vec_info lastinfo = NULL;
2815       if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2816         {
2817           ++i;
2818           continue;
2819         }
2820       for (i = i + 1; i < datarefs_copy.length (); ++i)
2821         {
2822           data_reference_p drb = datarefs_copy[i];
2823           stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2824           if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2825             break;
2826
2827           /* ???  Imperfect sorting (non-compatible types, non-modulo
2828              accesses, same accesses) can lead to a group to be artificially
2829              split here as we don't just skip over those.  If it really
2830              matters we can push those to a worklist and re-iterate
2831              over them.  The we can just skip ahead to the next DR here.  */
2832
2833           /* DRs in a different loop should not be put into the same
2834              interleaving group.  */
2835           if (gimple_bb (DR_STMT (dra))->loop_father
2836               != gimple_bb (DR_STMT (drb))->loop_father)
2837             break;
2838
2839           /* Check that the data-refs have same first location (except init)
2840              and they are both either store or load (not load and store,
2841              not masked loads or stores).  */
2842           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2843               || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2844                                         DR_BASE_ADDRESS (drb)) != 0
2845               || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2846               || !gimple_assign_single_p (DR_STMT (dra))
2847               || !gimple_assign_single_p (DR_STMT (drb)))
2848             break;
2849
2850           /* Check that the data-refs have the same constant size.  */
2851           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2852           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2853           if (!tree_fits_uhwi_p (sza)
2854               || !tree_fits_uhwi_p (szb)
2855               || !tree_int_cst_equal (sza, szb))
2856             break;
2857
2858           /* Check that the data-refs have the same step.  */
2859           if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2860             break;
2861
2862           /* Check the types are compatible.
2863              ???  We don't distinguish this during sorting.  */
2864           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2865                                    TREE_TYPE (DR_REF (drb))))
2866             break;
2867
2868           /* Check that the DR_INITs are compile-time constants.  */
2869           if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2870               || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2871             break;
2872
2873           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2874           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2875           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2876           HOST_WIDE_INT init_prev
2877             = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2878           gcc_assert (init_a <= init_b
2879                       && init_a <= init_prev
2880                       && init_prev <= init_b);
2881
2882           /* Do not place the same access in the interleaving chain twice.  */
2883           if (init_b == init_prev)
2884             {
2885               gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2886                           < gimple_uid (DR_STMT (drb)));
2887               /* ???  For now we simply "drop" the later reference which is
2888                  otherwise the same rather than finishing off this group.
2889                  In the end we'd want to re-process duplicates forming
2890                  multiple groups from the refs, likely by just collecting
2891                  all candidates (including duplicates and split points
2892                  below) in a vector and then process them together.  */
2893               continue;
2894             }
2895
2896           /* If init_b == init_a + the size of the type * k, we have an
2897              interleaving, and DRA is accessed before DRB.  */
2898           HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2899           if (type_size_a == 0
2900               || (init_b - init_a) % type_size_a != 0)
2901             break;
2902
2903           /* If we have a store, the accesses are adjacent.  This splits
2904              groups into chunks we support (we don't support vectorization
2905              of stores with gaps).  */
2906           if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
2907             break;
2908
2909           /* If the step (if not zero or non-constant) is greater than the
2910              difference between data-refs' inits this splits groups into
2911              suitable sizes.  */
2912           if (tree_fits_shwi_p (DR_STEP (dra)))
2913             {
2914               HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2915               if (step != 0 && step <= (init_b - init_a))
2916                 break;
2917             }
2918
2919           if (dump_enabled_p ())
2920             {
2921               dump_printf_loc (MSG_NOTE, vect_location,
2922                                "Detected interleaving ");
2923               if (DR_IS_READ (dra))
2924                 dump_printf (MSG_NOTE, "load ");
2925               else
2926                 dump_printf (MSG_NOTE, "store ");
2927               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2928               dump_printf (MSG_NOTE,  " and ");
2929               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2930               dump_printf (MSG_NOTE, "\n");
2931             }
2932
2933           /* Link the found element into the group list.  */
2934           if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2935             {
2936               GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2937               lastinfo = stmtinfo_a;
2938             }
2939           GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2940           GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2941           lastinfo = stmtinfo_b;
2942         }
2943     }
2944
2945   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2946     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2947         && !vect_analyze_data_ref_access (dr))
2948       {
2949         if (dump_enabled_p ())
2950           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2951                            "not vectorized: complicated access pattern.\n");
2952
2953         if (is_a <bb_vec_info> (vinfo))
2954           {
2955             /* Mark the statement as not vectorizable.  */
2956             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2957             continue;
2958           }
2959         else
2960           {
2961             datarefs_copy.release ();
2962             return false;
2963           }
2964       }
2965
2966   datarefs_copy.release ();
2967   return true;
2968 }
2969
2970 /* Function vect_vfa_segment_size.
2971
2972    Create an expression that computes the size of segment
2973    that will be accessed for a data reference.  The functions takes into
2974    account that realignment loads may access one more vector.
2975
2976    Input:
2977      DR: The data reference.
2978      LENGTH_FACTOR: segment length to consider.
2979
2980    Return an expression whose value is the size of segment which will be
2981    accessed by DR.  */
2982
2983 static tree
2984 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2985 {
2986   tree segment_length;
2987
2988   if (integer_zerop (DR_STEP (dr)))
2989     segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2990   else
2991     segment_length = size_binop (MULT_EXPR,
2992                                  fold_convert (sizetype, DR_STEP (dr)),
2993                                  fold_convert (sizetype, length_factor));
2994
2995   if (vect_supportable_dr_alignment (dr, false)
2996         == dr_explicit_realign_optimized)
2997     {
2998       tree vector_size = TYPE_SIZE_UNIT
2999                           (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
3000
3001       segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
3002     }
3003   return segment_length;
3004 }
3005
3006 /* Function vect_no_alias_p.
3007
3008    Given data references A and B with equal base and offset, see whether
3009    the alias relation can be decided at compilation time.  Return 1 if
3010    it can and the references alias, 0 if it can and the references do
3011    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A
3012    and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
3013    respectively.  */
3014
3015 static int
3016 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3017                          tree segment_length_a, tree segment_length_b)
3018 {
3019   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3020   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3021   poly_uint64 const_length_a;
3022   poly_uint64 const_length_b;
3023
3024   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3025      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3026      [a, a+12) */
3027   if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3028     {
3029       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3030       offset_a = (offset_a + vect_get_scalar_dr_size (a)) - const_length_a;
3031     }
3032   else
3033     const_length_a = tree_to_poly_uint64 (segment_length_a);
3034   if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3035     {
3036       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3037       offset_b = (offset_b + vect_get_scalar_dr_size (b)) - const_length_b;
3038     }
3039   else
3040     const_length_b = tree_to_poly_uint64 (segment_length_b);
3041
3042   if (ranges_known_overlap_p (offset_a, const_length_a,
3043                               offset_b, const_length_b))
3044     return 1;
3045
3046   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3047                                offset_b, const_length_b))
3048     return 0;
3049
3050   return -1;
3051 }
3052
3053 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3054    in DDR is >= VF.  */
3055
3056 static bool
3057 dependence_distance_ge_vf (data_dependence_relation *ddr,
3058                            unsigned int loop_depth, poly_uint64 vf)
3059 {
3060   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3061       || DDR_NUM_DIST_VECTS (ddr) == 0)
3062     return false;
3063
3064   /* If the dependence is exact, we should have limited the VF instead.  */
3065   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3066
3067   unsigned int i;
3068   lambda_vector dist_v;
3069   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3070     {
3071       HOST_WIDE_INT dist = dist_v[loop_depth];
3072       if (dist != 0
3073           && !(dist > 0 && DDR_REVERSED_P (ddr))
3074           && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3075         return false;
3076     }
3077
3078   if (dump_enabled_p ())
3079     {
3080       dump_printf_loc (MSG_NOTE, vect_location,
3081                        "dependence distance between ");
3082       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3083       dump_printf (MSG_NOTE,  " and ");
3084       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3085       dump_printf (MSG_NOTE,  " is >= VF\n");
3086     }
3087
3088   return true;
3089 }
3090
3091 /* Function vect_prune_runtime_alias_test_list.
3092
3093    Prune a list of ddrs to be tested at run-time by versioning for alias.
3094    Merge several alias checks into one if possible.
3095    Return FALSE if resulting list of ddrs is longer then allowed by
3096    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3097
3098 bool
3099 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3100 {
3101   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3102   hash_set <tree_pair_hash> compared_objects;
3103
3104   vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3105   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3106     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3107   vec<vec_object_pair> &check_unequal_addrs
3108     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3109   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3110   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3111
3112   ddr_p ddr;
3113   unsigned int i;
3114   tree length_factor;
3115
3116   if (dump_enabled_p ())
3117     dump_printf_loc (MSG_NOTE, vect_location,
3118                      "=== vect_prune_runtime_alias_test_list ===\n");
3119
3120   if (may_alias_ddrs.is_empty ())
3121     return true;
3122
3123   comp_alias_ddrs.create (may_alias_ddrs.length ());
3124
3125   unsigned int loop_depth
3126     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3127                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
3128
3129   /* First, we collect all data ref pairs for aliasing checks.  */
3130   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3131     {
3132       int comp_res;
3133       struct data_reference *dr_a, *dr_b;
3134       gimple *dr_group_first_a, *dr_group_first_b;
3135       tree segment_length_a, segment_length_b;
3136       gimple *stmt_a, *stmt_b;
3137
3138       /* Ignore the alias if the VF we chose ended up being no greater
3139          than the dependence distance.  */
3140       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3141         continue;
3142
3143       if (DDR_OBJECT_A (ddr))
3144         {
3145           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3146           if (!compared_objects.add (new_pair))
3147             {
3148               if (dump_enabled_p ())
3149                 {
3150                   dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3151                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3152                   dump_printf (MSG_NOTE, " and ");
3153                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3154                   dump_printf (MSG_NOTE, " have different addresses\n");
3155                 }
3156               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3157             }
3158           continue;
3159         }
3160
3161       dr_a = DDR_A (ddr);
3162       stmt_a = DR_STMT (DDR_A (ddr));
3163       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3164       if (dr_group_first_a)
3165         {
3166           stmt_a = dr_group_first_a;
3167           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3168         }
3169
3170       dr_b = DDR_B (ddr);
3171       stmt_b = DR_STMT (DDR_B (ddr));
3172       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3173       if (dr_group_first_b)
3174         {
3175           stmt_b = dr_group_first_b;
3176           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3177         }
3178
3179       if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3180         length_factor = scalar_loop_iters;
3181       else
3182         length_factor = size_int (vect_factor);
3183       segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3184       segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3185
3186       comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3187                                         DR_BASE_ADDRESS (dr_b));
3188       if (comp_res == 0)
3189         comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3190                                           DR_OFFSET (dr_b));
3191
3192       /* See whether the alias is known at compilation time.  */
3193       if (comp_res == 0
3194           && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3195           && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3196           && poly_int_tree_p (segment_length_a)
3197           && poly_int_tree_p (segment_length_b))
3198         {
3199           int res = vect_compile_time_alias (dr_a, dr_b,
3200                                              segment_length_a,
3201                                              segment_length_b);
3202           if (res == 0)
3203             continue;
3204
3205           if (res == 1)
3206             {
3207               if (dump_enabled_p ())
3208                 dump_printf_loc (MSG_NOTE, vect_location,
3209                                  "not vectorized: compilation time alias.\n");
3210               return false;
3211             }
3212         }
3213
3214       dr_with_seg_len_pair_t dr_with_seg_len_pair
3215           (dr_with_seg_len (dr_a, segment_length_a),
3216            dr_with_seg_len (dr_b, segment_length_b));
3217
3218       /* Canonicalize pairs by sorting the two DR members.  */
3219       if (comp_res > 0)
3220         std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3221
3222       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3223     }
3224
3225   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3226
3227   unsigned int count = (comp_alias_ddrs.length ()
3228                         + check_unequal_addrs.length ());
3229   dump_printf_loc (MSG_NOTE, vect_location,
3230                    "improved number of alias checks from %d to %d\n",
3231                    may_alias_ddrs.length (), count);
3232   if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3233     {
3234       if (dump_enabled_p ())
3235         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3236                          "number of versioning for alias "
3237                          "run-time tests exceeds %d "
3238                          "(--param vect-max-version-for-alias-checks)\n",
3239                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3240       return false;
3241     }
3242
3243   return true;
3244 }
3245
3246 /* Return true if a non-affine read or write in STMT is suitable for a
3247    gather load or scatter store.  Describe the operation in *INFO if so.  */
3248
3249 bool
3250 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3251                            gather_scatter_info *info)
3252 {
3253   HOST_WIDE_INT scale = 1;
3254   poly_int64 pbitpos, pbitsize;
3255   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3256   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3257   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3258   tree offtype = NULL_TREE;
3259   tree decl, base, off;
3260   machine_mode pmode;
3261   int punsignedp, reversep, pvolatilep = 0;
3262
3263   base = DR_REF (dr);
3264   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3265      see if we can use the def stmt of the address.  */
3266   if (is_gimple_call (stmt)
3267       && gimple_call_internal_p (stmt)
3268       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3269           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3270       && TREE_CODE (base) == MEM_REF
3271       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3272       && integer_zerop (TREE_OPERAND (base, 1))
3273       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3274     {
3275       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3276       if (is_gimple_assign (def_stmt)
3277           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3278         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3279     }
3280
3281   /* The gather and scatter builtins need address of the form
3282      loop_invariant + vector * {1, 2, 4, 8}
3283      or
3284      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3285      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3286      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3287      multiplications and additions in it.  To get a vector, we need
3288      a single SSA_NAME that will be defined in the loop and will
3289      contain everything that is not loop invariant and that can be
3290      vectorized.  The following code attempts to find such a preexistng
3291      SSA_NAME OFF and put the loop invariants into a tree BASE
3292      that can be gimplified before the loop.  */
3293   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3294                               &punsignedp, &reversep, &pvolatilep);
3295   gcc_assert (base && !reversep);
3296   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3297
3298   if (TREE_CODE (base) == MEM_REF)
3299     {
3300       if (!integer_zerop (TREE_OPERAND (base, 1)))
3301         {
3302           if (off == NULL_TREE)
3303             off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3304           else
3305             off = size_binop (PLUS_EXPR, off,
3306                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3307         }
3308       base = TREE_OPERAND (base, 0);
3309     }
3310   else
3311     base = build_fold_addr_expr (base);
3312
3313   if (off == NULL_TREE)
3314     off = size_zero_node;
3315
3316   /* If base is not loop invariant, either off is 0, then we start with just
3317      the constant offset in the loop invariant BASE and continue with base
3318      as OFF, otherwise give up.
3319      We could handle that case by gimplifying the addition of base + off
3320      into some SSA_NAME and use that as off, but for now punt.  */
3321   if (!expr_invariant_in_loop_p (loop, base))
3322     {
3323       if (!integer_zerop (off))
3324         return false;
3325       off = base;
3326       base = size_int (pbytepos);
3327     }
3328   /* Otherwise put base + constant offset into the loop invariant BASE
3329      and continue with OFF.  */
3330   else
3331     {
3332       base = fold_convert (sizetype, base);
3333       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3334     }
3335
3336   /* OFF at this point may be either a SSA_NAME or some tree expression
3337      from get_inner_reference.  Try to peel off loop invariants from it
3338      into BASE as long as possible.  */
3339   STRIP_NOPS (off);
3340   while (offtype == NULL_TREE)
3341     {
3342       enum tree_code code;
3343       tree op0, op1, add = NULL_TREE;
3344
3345       if (TREE_CODE (off) == SSA_NAME)
3346         {
3347           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3348
3349           if (expr_invariant_in_loop_p (loop, off))
3350             return false;
3351
3352           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3353             break;
3354
3355           op0 = gimple_assign_rhs1 (def_stmt);
3356           code = gimple_assign_rhs_code (def_stmt);
3357           op1 = gimple_assign_rhs2 (def_stmt);
3358         }
3359       else
3360         {
3361           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3362             return false;
3363           code = TREE_CODE (off);
3364           extract_ops_from_tree (off, &code, &op0, &op1);
3365         }
3366       switch (code)
3367         {
3368         case POINTER_PLUS_EXPR:
3369         case PLUS_EXPR:
3370           if (expr_invariant_in_loop_p (loop, op0))
3371             {
3372               add = op0;
3373               off = op1;
3374             do_add:
3375               add = fold_convert (sizetype, add);
3376               if (scale != 1)
3377                 add = size_binop (MULT_EXPR, add, size_int (scale));
3378               base = size_binop (PLUS_EXPR, base, add);
3379               continue;
3380             }
3381           if (expr_invariant_in_loop_p (loop, op1))
3382             {
3383               add = op1;
3384               off = op0;
3385               goto do_add;
3386             }
3387           break;
3388         case MINUS_EXPR:
3389           if (expr_invariant_in_loop_p (loop, op1))
3390             {
3391               add = fold_convert (sizetype, op1);
3392               add = size_binop (MINUS_EXPR, size_zero_node, add);
3393               off = op0;
3394               goto do_add;
3395             }
3396           break;
3397         case MULT_EXPR:
3398           if (scale == 1 && tree_fits_shwi_p (op1))
3399             {
3400               scale = tree_to_shwi (op1);
3401               off = op0;
3402               continue;
3403             }
3404           break;
3405         case SSA_NAME:
3406           off = op0;
3407           continue;
3408         CASE_CONVERT:
3409           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3410               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3411             break;
3412           if (TYPE_PRECISION (TREE_TYPE (op0))
3413               == TYPE_PRECISION (TREE_TYPE (off)))
3414             {
3415               off = op0;
3416               continue;
3417             }
3418           if (TYPE_PRECISION (TREE_TYPE (op0))
3419               < TYPE_PRECISION (TREE_TYPE (off)))
3420             {
3421               off = op0;
3422               offtype = TREE_TYPE (off);
3423               STRIP_NOPS (off);
3424               continue;
3425             }
3426           break;
3427         default:
3428           break;
3429         }
3430       break;
3431     }
3432
3433   /* If at the end OFF still isn't a SSA_NAME or isn't
3434      defined in the loop, punt.  */
3435   if (TREE_CODE (off) != SSA_NAME
3436       || expr_invariant_in_loop_p (loop, off))
3437     return false;
3438
3439   if (offtype == NULL_TREE)
3440     offtype = TREE_TYPE (off);
3441
3442   if (DR_IS_READ (dr))
3443     decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3444                                              offtype, scale);
3445   else
3446     decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3447                                               offtype, scale);
3448
3449   if (decl == NULL_TREE)
3450     return false;
3451
3452   info->decl = decl;
3453   info->base = base;
3454   info->offset = off;
3455   info->offset_dt = vect_unknown_def_type;
3456   info->offset_vectype = NULL_TREE;
3457   info->scale = scale;
3458   return true;
3459 }
3460
3461 /* Function vect_analyze_data_refs.
3462
3463   Find all the data references in the loop or basic block.
3464
3465    The general structure of the analysis of data refs in the vectorizer is as
3466    follows:
3467    1- vect_analyze_data_refs(loop/bb): call
3468       compute_data_dependences_for_loop/bb to find and analyze all data-refs
3469       in the loop/bb and their dependences.
3470    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3471    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3472    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3473
3474 */
3475
3476 bool
3477 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3478 {
3479   struct loop *loop = NULL;
3480   unsigned int i;
3481   struct data_reference *dr;
3482   tree scalar_type;
3483
3484   if (dump_enabled_p ())
3485     dump_printf_loc (MSG_NOTE, vect_location,
3486                      "=== vect_analyze_data_refs ===\n");
3487
3488   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3489     loop = LOOP_VINFO_LOOP (loop_vinfo);
3490
3491   /* Go through the data-refs, check that the analysis succeeded.  Update
3492      pointer from stmt_vec_info struct to DR and vectype.  */
3493
3494   vec<data_reference_p> datarefs = vinfo->datarefs;
3495   FOR_EACH_VEC_ELT (datarefs, i, dr)
3496     {
3497       gimple *stmt;
3498       stmt_vec_info stmt_info;
3499       tree base, offset, init;
3500       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3501       bool simd_lane_access = false;
3502       poly_uint64 vf;
3503
3504 again:
3505       if (!dr || !DR_REF (dr))
3506         {
3507           if (dump_enabled_p ())
3508             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3509                              "not vectorized: unhandled data-ref\n");
3510           return false;
3511         }
3512
3513       stmt = DR_STMT (dr);
3514       stmt_info = vinfo_for_stmt (stmt);
3515
3516       /* Discard clobbers from the dataref vector.  We will remove
3517          clobber stmts during vectorization.  */
3518       if (gimple_clobber_p (stmt))
3519         {
3520           free_data_ref (dr);
3521           if (i == datarefs.length () - 1)
3522             {
3523               datarefs.pop ();
3524               break;
3525             }
3526           datarefs.ordered_remove (i);
3527           dr = datarefs[i];
3528           goto again;
3529         }
3530
3531       /* Check that analysis of the data-ref succeeded.  */
3532       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3533           || !DR_STEP (dr))
3534         {
3535           bool maybe_gather
3536             = DR_IS_READ (dr)
3537               && !TREE_THIS_VOLATILE (DR_REF (dr))
3538               && targetm.vectorize.builtin_gather != NULL;
3539           bool maybe_scatter
3540             = DR_IS_WRITE (dr)
3541               && !TREE_THIS_VOLATILE (DR_REF (dr))
3542               && targetm.vectorize.builtin_scatter != NULL;
3543           bool maybe_simd_lane_access
3544             = is_a <loop_vec_info> (vinfo) && loop->simduid;
3545
3546           /* If target supports vector gather loads or scatter stores, or if
3547              this might be a SIMD lane access, see if they can't be used.  */
3548           if (is_a <loop_vec_info> (vinfo)
3549               && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3550               && !nested_in_vect_loop_p (loop, stmt))
3551             {
3552               struct data_reference *newdr
3553                 = create_data_ref (NULL, loop_containing_stmt (stmt),
3554                                    DR_REF (dr), stmt, !maybe_scatter,
3555                                    DR_IS_CONDITIONAL_IN_STMT (dr));
3556               gcc_assert (newdr != NULL && DR_REF (newdr));
3557               if (DR_BASE_ADDRESS (newdr)
3558                   && DR_OFFSET (newdr)
3559                   && DR_INIT (newdr)
3560                   && DR_STEP (newdr)
3561                   && integer_zerop (DR_STEP (newdr)))
3562                 {
3563                   if (maybe_simd_lane_access)
3564                     {
3565                       tree off = DR_OFFSET (newdr);
3566                       STRIP_NOPS (off);
3567                       if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3568                           && TREE_CODE (off) == MULT_EXPR
3569                           && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3570                         {
3571                           tree step = TREE_OPERAND (off, 1);
3572                           off = TREE_OPERAND (off, 0);
3573                           STRIP_NOPS (off);
3574                           if (CONVERT_EXPR_P (off)
3575                               && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3576                                                                           0)))
3577                                  < TYPE_PRECISION (TREE_TYPE (off)))
3578                             off = TREE_OPERAND (off, 0);
3579                           if (TREE_CODE (off) == SSA_NAME)
3580                             {
3581                               gimple *def = SSA_NAME_DEF_STMT (off);
3582                               tree reft = TREE_TYPE (DR_REF (newdr));
3583                               if (is_gimple_call (def)
3584                                   && gimple_call_internal_p (def)
3585                                   && (gimple_call_internal_fn (def)
3586                                       == IFN_GOMP_SIMD_LANE))
3587                                 {
3588                                   tree arg = gimple_call_arg (def, 0);
3589                                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
3590                                   arg = SSA_NAME_VAR (arg);
3591                                   if (arg == loop->simduid
3592                                       /* For now.  */
3593                                       && tree_int_cst_equal
3594                                            (TYPE_SIZE_UNIT (reft),
3595                                             step))
3596                                     {
3597                                       DR_OFFSET (newdr) = ssize_int (0);
3598                                       DR_STEP (newdr) = step;
3599                                       DR_OFFSET_ALIGNMENT (newdr)
3600                                         = BIGGEST_ALIGNMENT;
3601                                       DR_STEP_ALIGNMENT (newdr)
3602                                         = highest_pow2_factor (step);
3603                                       dr = newdr;
3604                                       simd_lane_access = true;
3605                                     }
3606                                 }
3607                             }
3608                         }
3609                     }
3610                   if (!simd_lane_access && (maybe_gather || maybe_scatter))
3611                     {
3612                       dr = newdr;
3613                       if (maybe_gather)
3614                         gatherscatter = GATHER;
3615                       else
3616                         gatherscatter = SCATTER;
3617                     }
3618                 }
3619               if (gatherscatter == SG_NONE && !simd_lane_access)
3620                 free_data_ref (newdr);
3621             }
3622
3623           if (gatherscatter == SG_NONE && !simd_lane_access)
3624             {
3625               if (dump_enabled_p ())
3626                 {
3627                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3628                                    "not vectorized: data ref analysis "
3629                                    "failed ");
3630                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3631                 }
3632
3633               if (is_a <bb_vec_info> (vinfo))
3634                 break;
3635
3636               return false;
3637             }
3638         }
3639
3640       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3641         {
3642           if (dump_enabled_p ())
3643             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3644                              "not vectorized: base addr of dr is a "
3645                              "constant\n");
3646
3647           if (is_a <bb_vec_info> (vinfo))
3648             break;
3649
3650           if (gatherscatter != SG_NONE || simd_lane_access)
3651             free_data_ref (dr);
3652           return false;
3653         }
3654
3655       if (TREE_THIS_VOLATILE (DR_REF (dr)))
3656         {
3657           if (dump_enabled_p ())
3658             {
3659               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3660                                "not vectorized: volatile type ");
3661               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3662             }
3663
3664           if (is_a <bb_vec_info> (vinfo))
3665             break;
3666
3667           return false;
3668         }
3669
3670       if (stmt_can_throw_internal (stmt))
3671         {
3672           if (dump_enabled_p ())
3673             {
3674               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3675                                "not vectorized: statement can throw an "
3676                                "exception ");
3677               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3678             }
3679
3680           if (is_a <bb_vec_info> (vinfo))
3681             break;
3682
3683           if (gatherscatter != SG_NONE || simd_lane_access)
3684             free_data_ref (dr);
3685           return false;
3686         }
3687
3688       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3689           && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3690         {
3691           if (dump_enabled_p ())
3692             {
3693               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3694                                "not vectorized: statement is bitfield "
3695                                "access ");
3696               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3697             }
3698
3699           if (is_a <bb_vec_info> (vinfo))
3700             break;
3701
3702           if (gatherscatter != SG_NONE || simd_lane_access)
3703             free_data_ref (dr);
3704           return false;
3705         }
3706
3707       base = unshare_expr (DR_BASE_ADDRESS (dr));
3708       offset = unshare_expr (DR_OFFSET (dr));
3709       init = unshare_expr (DR_INIT (dr));
3710
3711       if (is_gimple_call (stmt)
3712           && (!gimple_call_internal_p (stmt)
3713               || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3714                   && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3715         {
3716           if (dump_enabled_p ())
3717             {
3718               dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
3719                                "not vectorized: dr in a call ");
3720               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3721             }
3722
3723           if (is_a <bb_vec_info> (vinfo))
3724             break;
3725
3726           if (gatherscatter != SG_NONE || simd_lane_access)
3727             free_data_ref (dr);
3728           return false;
3729         }
3730
3731       /* Update DR field in stmt_vec_info struct.  */
3732
3733       /* If the dataref is in an inner-loop of the loop that is considered for
3734          for vectorization, we also want to analyze the access relative to
3735          the outer-loop (DR contains information only relative to the
3736          inner-most enclosing loop).  We do that by building a reference to the
3737          first location accessed by the inner-loop, and analyze it relative to
3738          the outer-loop.  */
3739       if (loop && nested_in_vect_loop_p (loop, stmt))
3740         {
3741           /* Build a reference to the first location accessed by the
3742              inner loop: *(BASE + INIT + OFFSET).  By construction,
3743              this address must be invariant in the inner loop, so we
3744              can consider it as being used in the outer loop.  */
3745           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3746                                           init, offset);
3747           tree init_addr = fold_build_pointer_plus (base, init_offset);
3748           tree init_ref = build_fold_indirect_ref (init_addr);
3749
3750           if (dump_enabled_p ())
3751             {
3752               dump_printf_loc (MSG_NOTE, vect_location,
3753                                "analyze in outer loop: ");
3754               dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3755               dump_printf (MSG_NOTE, "\n");
3756             }
3757
3758           if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3759                                      init_ref, loop))
3760             /* dr_analyze_innermost already explained the failure.  */
3761             return false;
3762
3763           if (dump_enabled_p ())
3764             {
3765               dump_printf_loc (MSG_NOTE, vect_location,
3766                                "\touter base_address: ");
3767               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3768                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3769               dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3770               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3771                                  STMT_VINFO_DR_OFFSET (stmt_info));
3772               dump_printf (MSG_NOTE,
3773                            "\n\touter constant offset from base address: ");
3774               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3775                                  STMT_VINFO_DR_INIT (stmt_info));
3776               dump_printf (MSG_NOTE, "\n\touter step: ");
3777               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3778                                  STMT_VINFO_DR_STEP (stmt_info));
3779               dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3780                            STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3781               dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3782                            STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3783               dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3784                            STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3785               dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3786                            STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3787             }
3788         }
3789
3790       if (STMT_VINFO_DATA_REF (stmt_info))
3791         {
3792           if (dump_enabled_p ())
3793             {
3794               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3795                                "not vectorized: more than one data ref "
3796                                "in stmt: ");
3797               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3798             }
3799
3800           if (is_a <bb_vec_info> (vinfo))
3801             break;
3802
3803           if (gatherscatter != SG_NONE || simd_lane_access)
3804             free_data_ref (dr);
3805           return false;
3806         }
3807
3808       STMT_VINFO_DATA_REF (stmt_info) = dr;
3809       if (simd_lane_access)
3810         {
3811           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3812           free_data_ref (datarefs[i]);
3813           datarefs[i] = dr;
3814         }
3815
3816       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3817           && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3818           && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3819         {
3820           if (dump_enabled_p ())
3821             {
3822               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3823                                "not vectorized: base object not addressable "
3824                                "for stmt: ");
3825               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3826             }
3827           if (is_a <bb_vec_info> (vinfo))
3828             {
3829               /* In BB vectorization the ref can still participate
3830                  in dependence analysis, we just can't vectorize it.  */
3831               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3832               continue;
3833             }
3834           return false;
3835         }
3836
3837       /* Set vectype for STMT.  */
3838       scalar_type = TREE_TYPE (DR_REF (dr));
3839       STMT_VINFO_VECTYPE (stmt_info)
3840         = get_vectype_for_scalar_type (scalar_type);
3841       if (!STMT_VINFO_VECTYPE (stmt_info))
3842         {
3843           if (dump_enabled_p ())
3844             {
3845               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3846                                "not vectorized: no vectype for stmt: ");
3847               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3848               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3849               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3850                                  scalar_type);
3851               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3852             }
3853
3854           if (is_a <bb_vec_info> (vinfo))
3855             {
3856               /* No vector type is fine, the ref can still participate
3857                  in dependence analysis, we just can't vectorize it.  */
3858               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3859               continue;
3860             }
3861
3862           if (gatherscatter != SG_NONE || simd_lane_access)
3863             {
3864               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3865               if (gatherscatter != SG_NONE)
3866                 free_data_ref (dr);
3867             }
3868           return false;
3869         }
3870       else
3871         {
3872           if (dump_enabled_p ())
3873             {
3874               dump_printf_loc (MSG_NOTE, vect_location,
3875                                "got vectype for stmt: ");
3876               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3877               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3878                                  STMT_VINFO_VECTYPE (stmt_info));
3879               dump_printf (MSG_NOTE, "\n");
3880             }
3881         }
3882
3883       /* Adjust the minimal vectorization factor according to the
3884          vector type.  */
3885       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3886       *min_vf = upper_bound (*min_vf, vf);
3887
3888       if (gatherscatter != SG_NONE)
3889         {
3890           gather_scatter_info gs_info;
3891           if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3892                                           &gs_info)
3893               || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3894             {
3895               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3896               free_data_ref (dr);
3897               if (dump_enabled_p ())
3898                 {
3899                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3900                                    (gatherscatter == GATHER) ?
3901                                    "not vectorized: not suitable for gather "
3902                                    "load " :
3903                                    "not vectorized: not suitable for scatter "
3904                                    "store ");
3905                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3906                 }
3907               return false;
3908             }
3909
3910           free_data_ref (datarefs[i]);
3911           datarefs[i] = dr;
3912           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3913         }
3914
3915       else if (is_a <loop_vec_info> (vinfo)
3916                && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3917         {
3918           if (nested_in_vect_loop_p (loop, stmt))
3919             {
3920               if (dump_enabled_p ())
3921                 {
3922                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3923                                    "not vectorized: not suitable for strided "
3924                                    "load ");
3925                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3926                 }
3927               return false;
3928             }
3929           STMT_VINFO_STRIDED_P (stmt_info) = true;
3930         }
3931     }
3932
3933   /* If we stopped analysis at the first dataref we could not analyze
3934      when trying to vectorize a basic-block mark the rest of the datarefs
3935      as not vectorizable and truncate the vector of datarefs.  That
3936      avoids spending useless time in analyzing their dependence.  */
3937   if (i != datarefs.length ())
3938     {
3939       gcc_assert (is_a <bb_vec_info> (vinfo));
3940       for (unsigned j = i; j < datarefs.length (); ++j)
3941         {
3942           data_reference_p dr = datarefs[j];
3943           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3944           free_data_ref (dr);
3945         }
3946       datarefs.truncate (i);
3947     }
3948
3949   return true;
3950 }
3951
3952
3953 /* Function vect_get_new_vect_var.
3954
3955    Returns a name for a new variable.  The current naming scheme appends the
3956    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3957    the name of vectorizer generated variables, and appends that to NAME if
3958    provided.  */
3959
3960 tree
3961 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3962 {
3963   const char *prefix;
3964   tree new_vect_var;
3965
3966   switch (var_kind)
3967   {
3968   case vect_simple_var:
3969     prefix = "vect";
3970     break;
3971   case vect_scalar_var:
3972     prefix = "stmp";
3973     break;
3974   case vect_mask_var:
3975     prefix = "mask";
3976     break;
3977   case vect_pointer_var:
3978     prefix = "vectp";
3979     break;
3980   default:
3981     gcc_unreachable ();
3982   }
3983
3984   if (name)
3985     {
3986       char* tmp = concat (prefix, "_", name, NULL);
3987       new_vect_var = create_tmp_reg (type, tmp);
3988       free (tmp);
3989     }
3990   else
3991     new_vect_var = create_tmp_reg (type, prefix);
3992
3993   return new_vect_var;
3994 }
3995
3996 /* Like vect_get_new_vect_var but return an SSA name.  */
3997
3998 tree
3999 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4000 {
4001   const char *prefix;
4002   tree new_vect_var;
4003
4004   switch (var_kind)
4005   {
4006   case vect_simple_var:
4007     prefix = "vect";
4008     break;
4009   case vect_scalar_var:
4010     prefix = "stmp";
4011     break;
4012   case vect_pointer_var:
4013     prefix = "vectp";
4014     break;
4015   default:
4016     gcc_unreachable ();
4017   }
4018
4019   if (name)
4020     {
4021       char* tmp = concat (prefix, "_", name, NULL);
4022       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4023       free (tmp);
4024     }
4025   else
4026     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4027
4028   return new_vect_var;
4029 }
4030
4031 /* Duplicate ptr info and set alignment/misaligment on NAME from DR.  */
4032
4033 static void
4034 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4035 {
4036   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4037   int misalign = DR_MISALIGNMENT (dr);
4038   if (misalign == DR_MISALIGNMENT_UNKNOWN)
4039     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4040   else
4041     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4042                             DR_TARGET_ALIGNMENT (dr), misalign);
4043 }
4044
4045 /* Function vect_create_addr_base_for_vector_ref.
4046
4047    Create an expression that computes the address of the first memory location
4048    that will be accessed for a data reference.
4049
4050    Input:
4051    STMT: The statement containing the data reference.
4052    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4053    OFFSET: Optional. If supplied, it is be added to the initial address.
4054    LOOP:    Specify relative to which loop-nest should the address be computed.
4055             For example, when the dataref is in an inner-loop nested in an
4056             outer-loop that is now being vectorized, LOOP can be either the
4057             outer-loop, or the inner-loop.  The first memory location accessed
4058             by the following dataref ('in' points to short):
4059
4060                 for (i=0; i<N; i++)
4061                    for (j=0; j<M; j++)
4062                      s += in[i+j]
4063
4064             is as follows:
4065             if LOOP=i_loop:     &in             (relative to i_loop)
4066             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
4067    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4068             initial address.  Unlike OFFSET, which is number of elements to
4069             be added, BYTE_OFFSET is measured in bytes.
4070
4071    Output:
4072    1. Return an SSA_NAME whose value is the address of the memory location of
4073       the first vector of the data reference.
4074    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4075       these statement(s) which define the returned SSA_NAME.
4076
4077    FORNOW: We are only handling array accesses with step 1.  */
4078
4079 tree
4080 vect_create_addr_base_for_vector_ref (gimple *stmt,
4081                                       gimple_seq *new_stmt_list,
4082                                       tree offset,
4083                                       tree byte_offset)
4084 {
4085   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4086   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4087   const char *base_name;
4088   tree addr_base;
4089   tree dest;
4090   gimple_seq seq = NULL;
4091   tree vect_ptr_type;
4092   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4093   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4094   innermost_loop_behavior *drb = vect_dr_behavior (dr);
4095
4096   tree data_ref_base = unshare_expr (drb->base_address);
4097   tree base_offset = unshare_expr (drb->offset);
4098   tree init = unshare_expr (drb->init);
4099
4100   if (loop_vinfo)
4101     base_name = get_name (data_ref_base);
4102   else
4103     {
4104       base_offset = ssize_int (0);
4105       init = ssize_int (0);
4106       base_name = get_name (DR_REF (dr));
4107     }
4108
4109   /* Create base_offset */
4110   base_offset = size_binop (PLUS_EXPR,
4111                             fold_convert (sizetype, base_offset),
4112                             fold_convert (sizetype, init));
4113
4114   if (offset)
4115     {
4116       offset = fold_build2 (MULT_EXPR, sizetype,
4117                             fold_convert (sizetype, offset), step);
4118       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4119                                  base_offset, offset);
4120     }
4121   if (byte_offset)
4122     {
4123       byte_offset = fold_convert (sizetype, byte_offset);
4124       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4125                                  base_offset, byte_offset);
4126     }
4127
4128   /* base + base_offset */
4129   if (loop_vinfo)
4130     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4131   else
4132     {
4133       addr_base = build1 (ADDR_EXPR,
4134                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
4135                           unshare_expr (DR_REF (dr)));
4136     }
4137
4138   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4139   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4140   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4141   gimple_seq_add_seq (new_stmt_list, seq);
4142
4143   if (DR_PTR_INFO (dr)
4144       && TREE_CODE (addr_base) == SSA_NAME
4145       && !SSA_NAME_PTR_INFO (addr_base))
4146     {
4147       vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4148       if (offset || byte_offset)
4149         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4150     }
4151
4152   if (dump_enabled_p ())
4153     {
4154       dump_printf_loc (MSG_NOTE, vect_location, "created ");
4155       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4156       dump_printf (MSG_NOTE, "\n");
4157     }
4158
4159   return addr_base;
4160 }
4161
4162
4163 /* Function vect_create_data_ref_ptr.
4164
4165    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4166    location accessed in the loop by STMT, along with the def-use update
4167    chain to appropriately advance the pointer through the loop iterations.
4168    Also set aliasing information for the pointer.  This pointer is used by
4169    the callers to this function to create a memory reference expression for
4170    vector load/store access.
4171
4172    Input:
4173    1. STMT: a stmt that references memory. Expected to be of the form
4174          GIMPLE_ASSIGN <name, data-ref> or
4175          GIMPLE_ASSIGN <data-ref, name>.
4176    2. AGGR_TYPE: the type of the reference, which should be either a vector
4177         or an array.
4178    3. AT_LOOP: the loop where the vector memref is to be created.
4179    4. OFFSET (optional): an offset to be added to the initial address accessed
4180         by the data-ref in STMT.
4181    5. BSI: location where the new stmts are to be placed if there is no loop
4182    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4183         pointing to the initial address.
4184    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4185         to the initial address accessed by the data-ref in STMT.  This is
4186         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4187         in bytes.
4188
4189    Output:
4190    1. Declare a new ptr to vector_type, and have it point to the base of the
4191       data reference (initial addressed accessed by the data reference).
4192       For example, for vector of type V8HI, the following code is generated:
4193
4194       v8hi *ap;
4195       ap = (v8hi *)initial_address;
4196
4197       if OFFSET is not supplied:
4198          initial_address = &a[init];
4199       if OFFSET is supplied:
4200          initial_address = &a[init + OFFSET];
4201       if BYTE_OFFSET is supplied:
4202          initial_address = &a[init] + BYTE_OFFSET;
4203
4204       Return the initial_address in INITIAL_ADDRESS.
4205
4206    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4207       update the pointer in each iteration of the loop.
4208
4209       Return the increment stmt that updates the pointer in PTR_INCR.
4210
4211    3. Set INV_P to true if the access pattern of the data reference in the
4212       vectorized loop is invariant.  Set it to false otherwise.
4213
4214    4. Return the pointer.  */
4215
4216 tree
4217 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4218                           tree offset, tree *initial_address,
4219                           gimple_stmt_iterator *gsi, gimple **ptr_incr,
4220                           bool only_init, bool *inv_p, tree byte_offset)
4221 {
4222   const char *base_name;
4223   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4224   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4225   struct loop *loop = NULL;
4226   bool nested_in_vect_loop = false;
4227   struct loop *containing_loop = NULL;
4228   tree aggr_ptr_type;
4229   tree aggr_ptr;
4230   tree new_temp;
4231   gimple_seq new_stmt_list = NULL;
4232   edge pe = NULL;
4233   basic_block new_bb;
4234   tree aggr_ptr_init;
4235   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4236   tree aptr;
4237   gimple_stmt_iterator incr_gsi;
4238   bool insert_after;
4239   tree indx_before_incr, indx_after_incr;
4240   gimple *incr;
4241   tree step;
4242   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4243
4244   gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4245               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4246
4247   if (loop_vinfo)
4248     {
4249       loop = LOOP_VINFO_LOOP (loop_vinfo);
4250       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4251       containing_loop = (gimple_bb (stmt))->loop_father;
4252       pe = loop_preheader_edge (loop);
4253     }
4254   else
4255     {
4256       gcc_assert (bb_vinfo);
4257       only_init = true;
4258       *ptr_incr = NULL;
4259     }
4260
4261   /* Check the step (evolution) of the load in LOOP, and record
4262      whether it's invariant.  */
4263   step = vect_dr_behavior (dr)->step;
4264   if (integer_zerop (step))
4265     *inv_p = true;
4266   else
4267     *inv_p = false;
4268
4269   /* Create an expression for the first address accessed by this load
4270      in LOOP.  */
4271   base_name = get_name (DR_BASE_ADDRESS (dr));
4272
4273   if (dump_enabled_p ())
4274     {
4275       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4276       dump_printf_loc (MSG_NOTE, vect_location,
4277                        "create %s-pointer variable to type: ",
4278                        get_tree_code_name (TREE_CODE (aggr_type)));
4279       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4280       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4281         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4282       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4283         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4284       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4285         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4286       else
4287         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4288       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4289       dump_printf (MSG_NOTE, "\n");
4290     }
4291
4292   /* (1) Create the new aggregate-pointer variable.
4293      Vector and array types inherit the alias set of their component
4294      type by default so we need to use a ref-all pointer if the data
4295      reference does not conflict with the created aggregated data
4296      reference because it is not addressable.  */
4297   bool need_ref_all = false;
4298   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4299                               get_alias_set (DR_REF (dr))))
4300     need_ref_all = true;
4301   /* Likewise for any of the data references in the stmt group.  */
4302   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4303     {
4304       gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4305       do
4306         {
4307           stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4308           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4309           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4310                                       get_alias_set (DR_REF (sdr))))
4311             {
4312               need_ref_all = true;
4313               break;
4314             }
4315           orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4316         }
4317       while (orig_stmt);
4318     }
4319   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4320                                                need_ref_all);
4321   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4322
4323
4324   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4325      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4326      def-use update cycles for the pointer: one relative to the outer-loop
4327      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4328      to the inner-loop (which is the inner-most loop containing the dataref),
4329      and this is done be step (5) below.
4330
4331      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4332      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4333      redundant.  Steps (3),(4) create the following:
4334
4335         vp0 = &base_addr;
4336         LOOP:   vp1 = phi(vp0,vp2)
4337                 ...
4338                 ...
4339                 vp2 = vp1 + step
4340                 goto LOOP
4341
4342      If there is an inner-loop nested in loop, then step (5) will also be
4343      applied, and an additional update in the inner-loop will be created:
4344
4345         vp0 = &base_addr;
4346         LOOP:   vp1 = phi(vp0,vp2)
4347                 ...
4348         inner:     vp3 = phi(vp1,vp4)
4349                    vp4 = vp3 + inner_step
4350                    if () goto inner
4351                 ...
4352                 vp2 = vp1 + step
4353                 if () goto LOOP   */
4354
4355   /* (2) Calculate the initial address of the aggregate-pointer, and set
4356      the aggregate-pointer to point to it before the loop.  */
4357
4358   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4359
4360   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4361                                                    offset, byte_offset);
4362   if (new_stmt_list)
4363     {
4364       if (pe)
4365         {
4366           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4367           gcc_assert (!new_bb);
4368         }
4369       else
4370         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4371     }
4372
4373   *initial_address = new_temp;
4374   aggr_ptr_init = new_temp;
4375
4376   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4377      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4378      inner-loop nested in LOOP (during outer-loop vectorization).  */
4379
4380   /* No update in loop is required.  */
4381   if (only_init && (!loop_vinfo || at_loop == loop))
4382     aptr = aggr_ptr_init;
4383   else
4384     {
4385       /* The step of the aggregate pointer is the type size.  */
4386       tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4387       /* One exception to the above is when the scalar step of the load in
4388          LOOP is zero. In this case the step here is also zero.  */
4389       if (*inv_p)
4390         iv_step = size_zero_node;
4391       else if (tree_int_cst_sgn (step) == -1)
4392         iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4393
4394       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4395
4396       create_iv (aggr_ptr_init,
4397                  fold_convert (aggr_ptr_type, iv_step),
4398                  aggr_ptr, loop, &incr_gsi, insert_after,
4399                  &indx_before_incr, &indx_after_incr);
4400       incr = gsi_stmt (incr_gsi);
4401       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4402
4403       /* Copy the points-to information if it exists. */
4404       if (DR_PTR_INFO (dr))
4405         {
4406           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4407           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4408         }
4409       if (ptr_incr)
4410         *ptr_incr = incr;
4411
4412       aptr = indx_before_incr;
4413     }
4414
4415   if (!nested_in_vect_loop || only_init)
4416     return aptr;
4417
4418
4419   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4420      nested in LOOP, if exists.  */
4421
4422   gcc_assert (nested_in_vect_loop);
4423   if (!only_init)
4424     {
4425       standard_iv_increment_position (containing_loop, &incr_gsi,
4426                                       &insert_after);
4427       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4428                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4429                  &indx_after_incr);
4430       incr = gsi_stmt (incr_gsi);
4431       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4432
4433       /* Copy the points-to information if it exists. */
4434       if (DR_PTR_INFO (dr))
4435         {
4436           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4437           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4438         }
4439       if (ptr_incr)
4440         *ptr_incr = incr;
4441
4442       return indx_before_incr;
4443     }
4444   else
4445     gcc_unreachable ();
4446 }
4447
4448
4449 /* Function bump_vector_ptr
4450
4451    Increment a pointer (to a vector type) by vector-size. If requested,
4452    i.e. if PTR-INCR is given, then also connect the new increment stmt
4453    to the existing def-use update-chain of the pointer, by modifying
4454    the PTR_INCR as illustrated below:
4455
4456    The pointer def-use update-chain before this function:
4457                         DATAREF_PTR = phi (p_0, p_2)
4458                         ....
4459         PTR_INCR:       p_2 = DATAREF_PTR + step
4460
4461    The pointer def-use update-chain after this function:
4462                         DATAREF_PTR = phi (p_0, p_2)
4463                         ....
4464                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4465                         ....
4466         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4467
4468    Input:
4469    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4470                  in the loop.
4471    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4472               the loop.  The increment amount across iterations is expected
4473               to be vector_size.
4474    BSI - location where the new update stmt is to be placed.
4475    STMT - the original scalar memory-access stmt that is being vectorized.
4476    BUMP - optional. The offset by which to bump the pointer. If not given,
4477           the offset is assumed to be vector_size.
4478
4479    Output: Return NEW_DATAREF_PTR as illustrated above.
4480
4481 */
4482
4483 tree
4484 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4485                  gimple *stmt, tree bump)
4486 {
4487   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4488   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4489   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4490   tree update = TYPE_SIZE_UNIT (vectype);
4491   gassign *incr_stmt;
4492   ssa_op_iter iter;
4493   use_operand_p use_p;
4494   tree new_dataref_ptr;
4495
4496   if (bump)
4497     update = bump;
4498
4499   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4500     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4501   else
4502     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4503   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4504                                    dataref_ptr, update);
4505   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4506
4507   /* Copy the points-to information if it exists. */
4508   if (DR_PTR_INFO (dr))
4509     {
4510       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4511       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4512     }
4513
4514   if (!ptr_incr)
4515     return new_dataref_ptr;
4516
4517   /* Update the vector-pointer's cross-iteration increment.  */
4518   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4519     {
4520       tree use = USE_FROM_PTR (use_p);
4521
4522       if (use == dataref_ptr)
4523         SET_USE (use_p, new_dataref_ptr);
4524       else
4525         gcc_assert (tree_int_cst_compare (use, update) == 0);
4526     }
4527
4528   return new_dataref_ptr;
4529 }
4530
4531
4532 /* Function vect_create_destination_var.
4533
4534    Create a new temporary of type VECTYPE.  */
4535
4536 tree
4537 vect_create_destination_var (tree scalar_dest, tree vectype)
4538 {
4539   tree vec_dest;
4540   const char *name;
4541   char *new_name;
4542   tree type;
4543   enum vect_var_kind kind;
4544
4545   kind = vectype
4546     ? VECTOR_BOOLEAN_TYPE_P (vectype)
4547     ? vect_mask_var
4548     : vect_simple_var
4549     : vect_scalar_var;
4550   type = vectype ? vectype : TREE_TYPE (scalar_dest);
4551
4552   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4553
4554   name = get_name (scalar_dest);
4555   if (name)
4556     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4557   else
4558     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4559   vec_dest = vect_get_new_vect_var (type, kind, new_name);
4560   free (new_name);
4561
4562   return vec_dest;
4563 }
4564
4565 /* Function vect_grouped_store_supported.
4566
4567    Returns TRUE if interleave high and interleave low permutations
4568    are supported, and FALSE otherwise.  */
4569
4570 bool
4571 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4572 {
4573   machine_mode mode = TYPE_MODE (vectype);
4574
4575   /* vect_permute_store_chain requires the group size to be equal to 3 or
4576      be a power of two.  */
4577   if (count != 3 && exact_log2 (count) == -1)
4578     {
4579       if (dump_enabled_p ())
4580         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4581                          "the size of the group of accesses"
4582                          " is not a power of 2 or not eqaul to 3\n");
4583       return false;
4584     }
4585
4586   /* Check that the permutation is supported.  */
4587   if (VECTOR_MODE_P (mode))
4588     {
4589       unsigned int i;
4590       if (count == 3)
4591         {
4592           unsigned int j0 = 0, j1 = 0, j2 = 0;
4593           unsigned int i, j;
4594
4595           unsigned int nelt;
4596           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4597             {
4598               if (dump_enabled_p ())
4599                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4600                                  "cannot handle groups of 3 stores for"
4601                                  " variable-length vectors\n");
4602               return false;
4603             }
4604
4605           vec_perm_builder sel (nelt, nelt, 1);
4606           sel.quick_grow (nelt);
4607           vec_perm_indices indices;
4608           for (j = 0; j < 3; j++)
4609             {
4610               int nelt0 = ((3 - j) * nelt) % 3;
4611               int nelt1 = ((3 - j) * nelt + 1) % 3;
4612               int nelt2 = ((3 - j) * nelt + 2) % 3;
4613               for (i = 0; i < nelt; i++)
4614                 {
4615                   if (3 * i + nelt0 < nelt)
4616                     sel[3 * i + nelt0] = j0++;
4617                   if (3 * i + nelt1 < nelt)
4618                     sel[3 * i + nelt1] = nelt + j1++;
4619                   if (3 * i + nelt2 < nelt)
4620                     sel[3 * i + nelt2] = 0;
4621                 }
4622               indices.new_vector (sel, 2, nelt);
4623               if (!can_vec_perm_const_p (mode, indices))
4624                 {
4625                   if (dump_enabled_p ())
4626                     dump_printf (MSG_MISSED_OPTIMIZATION,
4627                                  "permutation op not supported by target.\n");
4628                   return false;
4629                 }
4630
4631               for (i = 0; i < nelt; i++)
4632                 {
4633                   if (3 * i + nelt0 < nelt)
4634                     sel[3 * i + nelt0] = 3 * i + nelt0;
4635                   if (3 * i + nelt1 < nelt)
4636                     sel[3 * i + nelt1] = 3 * i + nelt1;
4637                   if (3 * i + nelt2 < nelt)
4638                     sel[3 * i + nelt2] = nelt + j2++;
4639                 }
4640               indices.new_vector (sel, 2, nelt);
4641               if (!can_vec_perm_const_p (mode, indices))
4642                 {
4643                   if (dump_enabled_p ())
4644                     dump_printf (MSG_MISSED_OPTIMIZATION,
4645                                  "permutation op not supported by target.\n");
4646                   return false;
4647                 }
4648             }
4649           return true;
4650         }
4651       else
4652         {
4653           /* If length is not equal to 3 then only power of 2 is supported.  */
4654           gcc_assert (pow2p_hwi (count));
4655           poly_uint64 nelt = GET_MODE_NUNITS (mode);
4656
4657           /* The encoding has 2 interleaved stepped patterns.  */
4658           vec_perm_builder sel (nelt, 2, 3);
4659           sel.quick_grow (6);
4660           for (i = 0; i < 3; i++)
4661             {
4662               sel[i * 2] = i;
4663               sel[i * 2 + 1] = i + nelt;
4664             }
4665           vec_perm_indices indices (sel, 2, nelt);
4666           if (can_vec_perm_const_p (mode, indices))
4667             {
4668               for (i = 0; i < 6; i++)
4669                 sel[i] += exact_div (nelt, 2);
4670               indices.new_vector (sel, 2, nelt);
4671               if (can_vec_perm_const_p (mode, indices))
4672                 return true;
4673             }
4674         }
4675     }
4676
4677   if (dump_enabled_p ())
4678     dump_printf (MSG_MISSED_OPTIMIZATION,
4679                  "permutaion op not supported by target.\n");
4680   return false;
4681 }
4682
4683
4684 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4685    type VECTYPE.  */
4686
4687 bool
4688 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4689 {
4690   return vect_lanes_optab_supported_p ("vec_store_lanes",
4691                                        vec_store_lanes_optab,
4692                                        vectype, count);
4693 }
4694
4695
4696 /* Function vect_permute_store_chain.
4697
4698    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4699    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4700    the data correctly for the stores.  Return the final references for stores
4701    in RESULT_CHAIN.
4702
4703    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4704    The input is 4 vectors each containing 8 elements.  We assign a number to
4705    each element, the input sequence is:
4706
4707    1st vec:   0  1  2  3  4  5  6  7
4708    2nd vec:   8  9 10 11 12 13 14 15
4709    3rd vec:  16 17 18 19 20 21 22 23
4710    4th vec:  24 25 26 27 28 29 30 31
4711
4712    The output sequence should be:
4713
4714    1st vec:  0  8 16 24  1  9 17 25
4715    2nd vec:  2 10 18 26  3 11 19 27
4716    3rd vec:  4 12 20 28  5 13 21 30
4717    4th vec:  6 14 22 30  7 15 23 31
4718
4719    i.e., we interleave the contents of the four vectors in their order.
4720
4721    We use interleave_high/low instructions to create such output.  The input of
4722    each interleave_high/low operation is two vectors:
4723    1st vec    2nd vec
4724    0 1 2 3    4 5 6 7
4725    the even elements of the result vector are obtained left-to-right from the
4726    high/low elements of the first vector.  The odd elements of the result are
4727    obtained left-to-right from the high/low elements of the second vector.
4728    The output of interleave_high will be:   0 4 1 5
4729    and of interleave_low:                   2 6 3 7
4730
4731
4732    The permutation is done in log LENGTH stages.  In each stage interleave_high
4733    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4734    where the first argument is taken from the first half of DR_CHAIN and the
4735    second argument from it's second half.
4736    In our example,
4737
4738    I1: interleave_high (1st vec, 3rd vec)
4739    I2: interleave_low (1st vec, 3rd vec)
4740    I3: interleave_high (2nd vec, 4th vec)
4741    I4: interleave_low (2nd vec, 4th vec)
4742
4743    The output for the first stage is:
4744
4745    I1:  0 16  1 17  2 18  3 19
4746    I2:  4 20  5 21  6 22  7 23
4747    I3:  8 24  9 25 10 26 11 27
4748    I4: 12 28 13 29 14 30 15 31
4749
4750    The output of the second stage, i.e. the final result is:
4751
4752    I1:  0  8 16 24  1  9 17 25
4753    I2:  2 10 18 26  3 11 19 27
4754    I3:  4 12 20 28  5 13 21 30
4755    I4:  6 14 22 30  7 15 23 31.  */
4756
4757 void
4758 vect_permute_store_chain (vec<tree> dr_chain,
4759                           unsigned int length,
4760                           gimple *stmt,
4761                           gimple_stmt_iterator *gsi,
4762                           vec<tree> *result_chain)
4763 {
4764   tree vect1, vect2, high, low;
4765   gimple *perm_stmt;
4766   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4767   tree perm_mask_low, perm_mask_high;
4768   tree data_ref;
4769   tree perm3_mask_low, perm3_mask_high;
4770   unsigned int i, j, n, log_length = exact_log2 (length);
4771
4772   result_chain->quick_grow (length);
4773   memcpy (result_chain->address (), dr_chain.address (),
4774           length * sizeof (tree));
4775
4776   if (length == 3)
4777     {
4778       /* vect_grouped_store_supported ensures that this is constant.  */
4779       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4780       unsigned int j0 = 0, j1 = 0, j2 = 0;
4781
4782       vec_perm_builder sel (nelt, nelt, 1);
4783       sel.quick_grow (nelt);
4784       vec_perm_indices indices;
4785       for (j = 0; j < 3; j++)
4786         {
4787           int nelt0 = ((3 - j) * nelt) % 3;
4788           int nelt1 = ((3 - j) * nelt + 1) % 3;
4789           int nelt2 = ((3 - j) * nelt + 2) % 3;
4790
4791           for (i = 0; i < nelt; i++)
4792             {
4793               if (3 * i + nelt0 < nelt)
4794                 sel[3 * i + nelt0] = j0++;
4795               if (3 * i + nelt1 < nelt)
4796                 sel[3 * i + nelt1] = nelt + j1++;
4797               if (3 * i + nelt2 < nelt)
4798                 sel[3 * i + nelt2] = 0;
4799             }
4800           indices.new_vector (sel, 2, nelt);
4801           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
4802
4803           for (i = 0; i < nelt; i++)
4804             {
4805               if (3 * i + nelt0 < nelt)
4806                 sel[3 * i + nelt0] = 3 * i + nelt0;
4807               if (3 * i + nelt1 < nelt)
4808                 sel[3 * i + nelt1] = 3 * i + nelt1;
4809               if (3 * i + nelt2 < nelt)
4810                 sel[3 * i + nelt2] = nelt + j2++;
4811             }
4812           indices.new_vector (sel, 2, nelt);
4813           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
4814
4815           vect1 = dr_chain[0];
4816           vect2 = dr_chain[1];
4817
4818           /* Create interleaving stmt:
4819              low = VEC_PERM_EXPR <vect1, vect2,
4820                                   {j, nelt, *, j + 1, nelt + j + 1, *,
4821                                    j + 2, nelt + j + 2, *, ...}>  */
4822           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4823           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4824                                            vect2, perm3_mask_low);
4825           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4826
4827           vect1 = data_ref;
4828           vect2 = dr_chain[2];
4829           /* Create interleaving stmt:
4830              low = VEC_PERM_EXPR <vect1, vect2,
4831                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
4832                                    6, 7, nelt + j + 2, ...}>  */
4833           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4834           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4835                                            vect2, perm3_mask_high);
4836           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4837           (*result_chain)[j] = data_ref;
4838         }
4839     }
4840   else
4841     {
4842       /* If length is not equal to 3 then only power of 2 is supported.  */
4843       gcc_assert (pow2p_hwi (length));
4844
4845       /* The encoding has 2 interleaved stepped patterns.  */
4846       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
4847       vec_perm_builder sel (nelt, 2, 3);
4848       sel.quick_grow (6);
4849       for (i = 0; i < 3; i++)
4850         {
4851           sel[i * 2] = i;
4852           sel[i * 2 + 1] = i + nelt;
4853         }
4854         vec_perm_indices indices (sel, 2, nelt);
4855         perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
4856
4857         for (i = 0; i < 6; i++)
4858           sel[i] += exact_div (nelt, 2);
4859         indices.new_vector (sel, 2, nelt);
4860         perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
4861
4862         for (i = 0, n = log_length; i < n; i++)
4863           {
4864             for (j = 0; j < length/2; j++)
4865               {
4866                 vect1 = dr_chain[j];
4867                 vect2 = dr_chain[j+length/2];
4868
4869                 /* Create interleaving stmt:
4870                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4871                                                         ...}>  */
4872                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4873                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4874                                                  vect2, perm_mask_high);
4875                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4876                 (*result_chain)[2*j] = high;
4877
4878                 /* Create interleaving stmt:
4879                    low = VEC_PERM_EXPR <vect1, vect2,
4880                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4881                                          ...}>  */
4882                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4883                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4884                                                  vect2, perm_mask_low);
4885                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4886                 (*result_chain)[2*j+1] = low;
4887               }
4888             memcpy (dr_chain.address (), result_chain->address (),
4889                     length * sizeof (tree));
4890           }
4891     }
4892 }
4893
4894 /* Function vect_setup_realignment
4895
4896    This function is called when vectorizing an unaligned load using
4897    the dr_explicit_realign[_optimized] scheme.
4898    This function generates the following code at the loop prolog:
4899
4900       p = initial_addr;
4901    x  msq_init = *(floor(p));   # prolog load
4902       realignment_token = call target_builtin;
4903     loop:
4904    x  msq = phi (msq_init, ---)
4905
4906    The stmts marked with x are generated only for the case of
4907    dr_explicit_realign_optimized.
4908
4909    The code above sets up a new (vector) pointer, pointing to the first
4910    location accessed by STMT, and a "floor-aligned" load using that pointer.
4911    It also generates code to compute the "realignment-token" (if the relevant
4912    target hook was defined), and creates a phi-node at the loop-header bb
4913    whose arguments are the result of the prolog-load (created by this
4914    function) and the result of a load that takes place in the loop (to be
4915    created by the caller to this function).
4916
4917    For the case of dr_explicit_realign_optimized:
4918    The caller to this function uses the phi-result (msq) to create the
4919    realignment code inside the loop, and sets up the missing phi argument,
4920    as follows:
4921     loop:
4922       msq = phi (msq_init, lsq)
4923       lsq = *(floor(p'));        # load in loop
4924       result = realign_load (msq, lsq, realignment_token);
4925
4926    For the case of dr_explicit_realign:
4927     loop:
4928       msq = *(floor(p));        # load in loop
4929       p' = p + (VS-1);
4930       lsq = *(floor(p'));       # load in loop
4931       result = realign_load (msq, lsq, realignment_token);
4932
4933    Input:
4934    STMT - (scalar) load stmt to be vectorized. This load accesses
4935           a memory location that may be unaligned.
4936    BSI - place where new code is to be inserted.
4937    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4938                               is used.
4939
4940    Output:
4941    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4942                        target hook, if defined.
4943    Return value - the result of the loop-header phi node.  */
4944
4945 tree
4946 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4947                         tree *realignment_token,
4948                         enum dr_alignment_support alignment_support_scheme,
4949                         tree init_addr,
4950                         struct loop **at_loop)
4951 {
4952   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4953   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4954   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4955   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4956   struct loop *loop = NULL;
4957   edge pe = NULL;
4958   tree scalar_dest = gimple_assign_lhs (stmt);
4959   tree vec_dest;
4960   gimple *inc;
4961   tree ptr;
4962   tree data_ref;
4963   basic_block new_bb;
4964   tree msq_init = NULL_TREE;
4965   tree new_temp;
4966   gphi *phi_stmt;
4967   tree msq = NULL_TREE;
4968   gimple_seq stmts = NULL;
4969   bool inv_p;
4970   bool compute_in_loop = false;
4971   bool nested_in_vect_loop = false;
4972   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4973   struct loop *loop_for_initial_load = NULL;
4974
4975   if (loop_vinfo)
4976     {
4977       loop = LOOP_VINFO_LOOP (loop_vinfo);
4978       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4979     }
4980
4981   gcc_assert (alignment_support_scheme == dr_explicit_realign
4982               || alignment_support_scheme == dr_explicit_realign_optimized);
4983
4984   /* We need to generate three things:
4985      1. the misalignment computation
4986      2. the extra vector load (for the optimized realignment scheme).
4987      3. the phi node for the two vectors from which the realignment is
4988       done (for the optimized realignment scheme).  */
4989
4990   /* 1. Determine where to generate the misalignment computation.
4991
4992      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4993      calculation will be generated by this function, outside the loop (in the
4994      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
4995      caller, inside the loop.
4996
4997      Background: If the misalignment remains fixed throughout the iterations of
4998      the loop, then both realignment schemes are applicable, and also the
4999      misalignment computation can be done outside LOOP.  This is because we are
5000      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5001      are a multiple of VS (the Vector Size), and therefore the misalignment in
5002      different vectorized LOOP iterations is always the same.
5003      The problem arises only if the memory access is in an inner-loop nested
5004      inside LOOP, which is now being vectorized using outer-loop vectorization.
5005      This is the only case when the misalignment of the memory access may not
5006      remain fixed throughout the iterations of the inner-loop (as explained in
5007      detail in vect_supportable_dr_alignment).  In this case, not only is the
5008      optimized realignment scheme not applicable, but also the misalignment
5009      computation (and generation of the realignment token that is passed to
5010      REALIGN_LOAD) have to be done inside the loop.
5011
5012      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5013      or not, which in turn determines if the misalignment is computed inside
5014      the inner-loop, or outside LOOP.  */
5015
5016   if (init_addr != NULL_TREE || !loop_vinfo)
5017     {
5018       compute_in_loop = true;
5019       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5020     }
5021
5022
5023   /* 2. Determine where to generate the extra vector load.
5024
5025      For the optimized realignment scheme, instead of generating two vector
5026      loads in each iteration, we generate a single extra vector load in the
5027      preheader of the loop, and in each iteration reuse the result of the
5028      vector load from the previous iteration.  In case the memory access is in
5029      an inner-loop nested inside LOOP, which is now being vectorized using
5030      outer-loop vectorization, we need to determine whether this initial vector
5031      load should be generated at the preheader of the inner-loop, or can be
5032      generated at the preheader of LOOP.  If the memory access has no evolution
5033      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5034      to be generated inside LOOP (in the preheader of the inner-loop).  */
5035
5036   if (nested_in_vect_loop)
5037     {
5038       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5039       bool invariant_in_outerloop =
5040             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5041       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5042     }
5043   else
5044     loop_for_initial_load = loop;
5045   if (at_loop)
5046     *at_loop = loop_for_initial_load;
5047
5048   if (loop_for_initial_load)
5049     pe = loop_preheader_edge (loop_for_initial_load);
5050
5051   /* 3. For the case of the optimized realignment, create the first vector
5052       load at the loop preheader.  */
5053
5054   if (alignment_support_scheme == dr_explicit_realign_optimized)
5055     {
5056       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5057       gassign *new_stmt;
5058
5059       gcc_assert (!compute_in_loop);
5060       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5061       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5062                                       NULL_TREE, &init_addr, NULL, &inc,
5063                                       true, &inv_p);
5064       if (TREE_CODE (ptr) == SSA_NAME)
5065         new_temp = copy_ssa_name (ptr);
5066       else
5067         new_temp = make_ssa_name (TREE_TYPE (ptr));
5068       unsigned int align = DR_TARGET_ALIGNMENT (dr);
5069       new_stmt = gimple_build_assign
5070                    (new_temp, BIT_AND_EXPR, ptr,
5071                     build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5072       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5073       gcc_assert (!new_bb);
5074       data_ref
5075         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5076                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5077       new_stmt = gimple_build_assign (vec_dest, data_ref);
5078       new_temp = make_ssa_name (vec_dest, new_stmt);
5079       gimple_assign_set_lhs (new_stmt, new_temp);
5080       if (pe)
5081         {
5082           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5083           gcc_assert (!new_bb);
5084         }
5085       else
5086          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5087
5088       msq_init = gimple_assign_lhs (new_stmt);
5089     }
5090
5091   /* 4. Create realignment token using a target builtin, if available.
5092       It is done either inside the containing loop, or before LOOP (as
5093       determined above).  */
5094
5095   if (targetm.vectorize.builtin_mask_for_load)
5096     {
5097       gcall *new_stmt;
5098       tree builtin_decl;
5099
5100       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
5101       if (!init_addr)
5102         {
5103           /* Generate the INIT_ADDR computation outside LOOP.  */
5104           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5105                                                             NULL_TREE);
5106           if (loop)
5107             {
5108               pe = loop_preheader_edge (loop);
5109               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5110               gcc_assert (!new_bb);
5111             }
5112           else
5113              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5114         }
5115
5116       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5117       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5118       vec_dest =
5119         vect_create_destination_var (scalar_dest,
5120                                      gimple_call_return_type (new_stmt));
5121       new_temp = make_ssa_name (vec_dest, new_stmt);
5122       gimple_call_set_lhs (new_stmt, new_temp);
5123
5124       if (compute_in_loop)
5125         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5126       else
5127         {
5128           /* Generate the misalignment computation outside LOOP.  */
5129           pe = loop_preheader_edge (loop);
5130           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5131           gcc_assert (!new_bb);
5132         }
5133
5134       *realignment_token = gimple_call_lhs (new_stmt);
5135
5136       /* The result of the CALL_EXPR to this builtin is determined from
5137          the value of the parameter and no global variables are touched
5138          which makes the builtin a "const" function.  Requiring the
5139          builtin to have the "const" attribute makes it unnecessary
5140          to call mark_call_clobbered.  */
5141       gcc_assert (TREE_READONLY (builtin_decl));
5142     }
5143
5144   if (alignment_support_scheme == dr_explicit_realign)
5145     return msq;
5146
5147   gcc_assert (!compute_in_loop);
5148   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5149
5150
5151   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5152
5153   pe = loop_preheader_edge (containing_loop);
5154   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5155   msq = make_ssa_name (vec_dest);
5156   phi_stmt = create_phi_node (msq, containing_loop->header);
5157   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5158
5159   return msq;
5160 }
5161
5162
5163 /* Function vect_grouped_load_supported.
5164
5165    COUNT is the size of the load group (the number of statements plus the
5166    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5167    only one statement, with a gap of COUNT - 1.
5168
5169    Returns true if a suitable permute exists.  */
5170
5171 bool
5172 vect_grouped_load_supported (tree vectype, bool single_element_p,
5173                              unsigned HOST_WIDE_INT count)
5174 {
5175   machine_mode mode = TYPE_MODE (vectype);
5176
5177   /* If this is single-element interleaving with an element distance
5178      that leaves unused vector loads around punt - we at least create
5179      very sub-optimal code in that case (and blow up memory,
5180      see PR65518).  */
5181   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5182     {
5183       if (dump_enabled_p ())
5184         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5185                          "single-element interleaving not supported "
5186                          "for not adjacent vector loads\n");
5187       return false;
5188     }
5189
5190   /* vect_permute_load_chain requires the group size to be equal to 3 or
5191      be a power of two.  */
5192   if (count != 3 && exact_log2 (count) == -1)
5193     {
5194       if (dump_enabled_p ())
5195         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5196                          "the size of the group of accesses"
5197                          " is not a power of 2 or not equal to 3\n");
5198       return false;
5199     }
5200
5201   /* Check that the permutation is supported.  */
5202   if (VECTOR_MODE_P (mode))
5203     {
5204       unsigned int i, j;
5205       if (count == 3)
5206         {
5207           unsigned int nelt;
5208           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5209             {
5210               if (dump_enabled_p ())
5211                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5212                                  "cannot handle groups of 3 loads for"
5213                                  " variable-length vectors\n");
5214               return false;
5215             }
5216
5217           vec_perm_builder sel (nelt, nelt, 1);
5218           sel.quick_grow (nelt);
5219           vec_perm_indices indices;
5220           unsigned int k;
5221           for (k = 0; k < 3; k++)
5222             {
5223               for (i = 0; i < nelt; i++)
5224                 if (3 * i + k < 2 * nelt)
5225                   sel[i] = 3 * i + k;
5226                 else
5227                   sel[i] = 0;
5228               indices.new_vector (sel, 2, nelt);
5229               if (!can_vec_perm_const_p (mode, indices))
5230                 {
5231                   if (dump_enabled_p ())
5232                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5233                                      "shuffle of 3 loads is not supported by"
5234                                      " target\n");
5235                   return false;
5236                 }
5237               for (i = 0, j = 0; i < nelt; i++)
5238                 if (3 * i + k < 2 * nelt)
5239                   sel[i] = i;
5240                 else
5241                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5242               indices.new_vector (sel, 2, nelt);
5243               if (!can_vec_perm_const_p (mode, indices))
5244                 {
5245                   if (dump_enabled_p ())
5246                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5247                                      "shuffle of 3 loads is not supported by"
5248                                      " target\n");
5249                   return false;
5250                 }
5251             }
5252           return true;
5253         }
5254       else
5255         {
5256           /* If length is not equal to 3 then only power of 2 is supported.  */
5257           gcc_assert (pow2p_hwi (count));
5258           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5259
5260           /* The encoding has a single stepped pattern.  */
5261           vec_perm_builder sel (nelt, 1, 3);
5262           sel.quick_grow (3);
5263           for (i = 0; i < 3; i++)
5264             sel[i] = i * 2;
5265           vec_perm_indices indices (sel, 2, nelt);
5266           if (can_vec_perm_const_p (mode, indices))
5267             {
5268               for (i = 0; i < 3; i++)
5269                 sel[i] = i * 2 + 1;
5270               indices.new_vector (sel, 2, nelt);
5271               if (can_vec_perm_const_p (mode, indices))
5272                 return true;
5273             }
5274         }
5275     }
5276
5277   if (dump_enabled_p ())
5278     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5279                      "extract even/odd not supported by target\n");
5280   return false;
5281 }
5282
5283 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5284    type VECTYPE.  */
5285
5286 bool
5287 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5288 {
5289   return vect_lanes_optab_supported_p ("vec_load_lanes",
5290                                        vec_load_lanes_optab,
5291                                        vectype, count);
5292 }
5293
5294 /* Function vect_permute_load_chain.
5295
5296    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5297    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5298    the input data correctly.  Return the final references for loads in
5299    RESULT_CHAIN.
5300
5301    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5302    The input is 4 vectors each containing 8 elements. We assign a number to each
5303    element, the input sequence is:
5304
5305    1st vec:   0  1  2  3  4  5  6  7
5306    2nd vec:   8  9 10 11 12 13 14 15
5307    3rd vec:  16 17 18 19 20 21 22 23
5308    4th vec:  24 25 26 27 28 29 30 31
5309
5310    The output sequence should be:
5311
5312    1st vec:  0 4  8 12 16 20 24 28
5313    2nd vec:  1 5  9 13 17 21 25 29
5314    3rd vec:  2 6 10 14 18 22 26 30
5315    4th vec:  3 7 11 15 19 23 27 31
5316
5317    i.e., the first output vector should contain the first elements of each
5318    interleaving group, etc.
5319
5320    We use extract_even/odd instructions to create such output.  The input of
5321    each extract_even/odd operation is two vectors
5322    1st vec    2nd vec
5323    0 1 2 3    4 5 6 7
5324
5325    and the output is the vector of extracted even/odd elements.  The output of
5326    extract_even will be:   0 2 4 6
5327    and of extract_odd:     1 3 5 7
5328
5329
5330    The permutation is done in log LENGTH stages.  In each stage extract_even
5331    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5332    their order.  In our example,
5333
5334    E1: extract_even (1st vec, 2nd vec)
5335    E2: extract_odd (1st vec, 2nd vec)
5336    E3: extract_even (3rd vec, 4th vec)
5337    E4: extract_odd (3rd vec, 4th vec)
5338
5339    The output for the first stage will be:
5340
5341    E1:  0  2  4  6  8 10 12 14
5342    E2:  1  3  5  7  9 11 13 15
5343    E3: 16 18 20 22 24 26 28 30
5344    E4: 17 19 21 23 25 27 29 31
5345
5346    In order to proceed and create the correct sequence for the next stage (or
5347    for the correct output, if the second stage is the last one, as in our
5348    example), we first put the output of extract_even operation and then the
5349    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5350    The input for the second stage is:
5351
5352    1st vec (E1):  0  2  4  6  8 10 12 14
5353    2nd vec (E3): 16 18 20 22 24 26 28 30
5354    3rd vec (E2):  1  3  5  7  9 11 13 15
5355    4th vec (E4): 17 19 21 23 25 27 29 31
5356
5357    The output of the second stage:
5358
5359    E1: 0 4  8 12 16 20 24 28
5360    E2: 2 6 10 14 18 22 26 30
5361    E3: 1 5  9 13 17 21 25 29
5362    E4: 3 7 11 15 19 23 27 31
5363
5364    And RESULT_CHAIN after reordering:
5365
5366    1st vec (E1):  0 4  8 12 16 20 24 28
5367    2nd vec (E3):  1 5  9 13 17 21 25 29
5368    3rd vec (E2):  2 6 10 14 18 22 26 30
5369    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5370
5371 static void
5372 vect_permute_load_chain (vec<tree> dr_chain,
5373                          unsigned int length,
5374                          gimple *stmt,
5375                          gimple_stmt_iterator *gsi,
5376                          vec<tree> *result_chain)
5377 {
5378   tree data_ref, first_vect, second_vect;
5379   tree perm_mask_even, perm_mask_odd;
5380   tree perm3_mask_low, perm3_mask_high;
5381   gimple *perm_stmt;
5382   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5383   unsigned int i, j, log_length = exact_log2 (length);
5384
5385   result_chain->quick_grow (length);
5386   memcpy (result_chain->address (), dr_chain.address (),
5387           length * sizeof (tree));
5388
5389   if (length == 3)
5390     {
5391       /* vect_grouped_load_supported ensures that this is constant.  */
5392       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5393       unsigned int k;
5394
5395       vec_perm_builder sel (nelt, nelt, 1);
5396       sel.quick_grow (nelt);
5397       vec_perm_indices indices;
5398       for (k = 0; k < 3; k++)
5399         {
5400           for (i = 0; i < nelt; i++)
5401             if (3 * i + k < 2 * nelt)
5402               sel[i] = 3 * i + k;
5403             else
5404               sel[i] = 0;
5405           indices.new_vector (sel, 2, nelt);
5406           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5407
5408           for (i = 0, j = 0; i < nelt; i++)
5409             if (3 * i + k < 2 * nelt)
5410               sel[i] = i;
5411             else
5412               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5413           indices.new_vector (sel, 2, nelt);
5414           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5415
5416           first_vect = dr_chain[0];
5417           second_vect = dr_chain[1];
5418
5419           /* Create interleaving stmt (low part of):
5420              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5421                                                              ...}>  */
5422           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5423           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5424                                            second_vect, perm3_mask_low);
5425           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5426
5427           /* Create interleaving stmt (high part of):
5428              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5429                                                               ...}>  */
5430           first_vect = data_ref;
5431           second_vect = dr_chain[2];
5432           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5433           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5434                                            second_vect, perm3_mask_high);
5435           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5436           (*result_chain)[k] = data_ref;
5437         }
5438     }
5439   else
5440     {
5441       /* If length is not equal to 3 then only power of 2 is supported.  */
5442       gcc_assert (pow2p_hwi (length));
5443
5444       /* The encoding has a single stepped pattern.  */
5445       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5446       vec_perm_builder sel (nelt, 1, 3);
5447       sel.quick_grow (3);
5448       for (i = 0; i < 3; ++i)
5449         sel[i] = i * 2;
5450       vec_perm_indices indices (sel, 2, nelt);
5451       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5452
5453       for (i = 0; i < 3; ++i)
5454         sel[i] = i * 2 + 1;
5455       indices.new_vector (sel, 2, nelt);
5456       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5457
5458       for (i = 0; i < log_length; i++)
5459         {
5460           for (j = 0; j < length; j += 2)
5461             {
5462               first_vect = dr_chain[j];
5463               second_vect = dr_chain[j+1];
5464
5465               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5466               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5467               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5468                                                first_vect, second_vect,
5469                                                perm_mask_even);
5470               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5471               (*result_chain)[j/2] = data_ref;
5472
5473               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5474               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5475               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5476                                                first_vect, second_vect,
5477                                                perm_mask_odd);
5478               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5479               (*result_chain)[j/2+length/2] = data_ref;
5480             }
5481           memcpy (dr_chain.address (), result_chain->address (),
5482                   length * sizeof (tree));
5483         }
5484     }
5485 }
5486
5487 /* Function vect_shift_permute_load_chain.
5488
5489    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5490    sequence of stmts to reorder the input data accordingly.
5491    Return the final references for loads in RESULT_CHAIN.
5492    Return true if successed, false otherwise.
5493
5494    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5495    The input is 3 vectors each containing 8 elements.  We assign a
5496    number to each element, the input sequence is:
5497
5498    1st vec:   0  1  2  3  4  5  6  7
5499    2nd vec:   8  9 10 11 12 13 14 15
5500    3rd vec:  16 17 18 19 20 21 22 23
5501
5502    The output sequence should be:
5503
5504    1st vec:  0 3 6  9 12 15 18 21
5505    2nd vec:  1 4 7 10 13 16 19 22
5506    3rd vec:  2 5 8 11 14 17 20 23
5507
5508    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5509
5510    First we shuffle all 3 vectors to get correct elements order:
5511
5512    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5513    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5514    3rd vec:  (16 19 22) (17 20 23) (18 21)
5515
5516    Next we unite and shift vector 3 times:
5517
5518    1st step:
5519      shift right by 6 the concatenation of:
5520      "1st vec" and  "2nd vec"
5521        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5522      "2nd vec" and  "3rd vec"
5523        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5524      "3rd vec" and  "1st vec"
5525        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5526                              | New vectors                   |
5527
5528      So that now new vectors are:
5529
5530      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5531      2nd vec:  (10 13) (16 19 22) (17 20 23)
5532      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5533
5534    2nd step:
5535      shift right by 5 the concatenation of:
5536      "1st vec" and  "3rd vec"
5537        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5538      "2nd vec" and  "1st vec"
5539        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5540      "3rd vec" and  "2nd vec"
5541        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5542                           | New vectors                   |
5543
5544      So that now new vectors are:
5545
5546      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5547      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
5548      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
5549
5550    3rd step:
5551      shift right by 5 the concatenation of:
5552      "1st vec" and  "1st vec"
5553        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
5554      shift right by 3 the concatenation of:
5555      "2nd vec" and  "2nd vec"
5556                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
5557                           | New vectors                   |
5558
5559      So that now all vectors are READY:
5560      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
5561      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
5562      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
5563
5564    This algorithm is faster than one in vect_permute_load_chain if:
5565      1.  "shift of a concatination" is faster than general permutation.
5566          This is usually so.
5567      2.  The TARGET machine can't execute vector instructions in parallel.
5568          This is because each step of the algorithm depends on previous.
5569          The algorithm in vect_permute_load_chain is much more parallel.
5570
5571    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5572 */
5573
5574 static bool
5575 vect_shift_permute_load_chain (vec<tree> dr_chain,
5576                                unsigned int length,
5577                                gimple *stmt,
5578                                gimple_stmt_iterator *gsi,
5579                                vec<tree> *result_chain)
5580 {
5581   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5582   tree perm2_mask1, perm2_mask2, perm3_mask;
5583   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5584   gimple *perm_stmt;
5585
5586   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5587   unsigned int i;
5588   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5589   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5590
5591   unsigned HOST_WIDE_INT nelt, vf;
5592   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
5593       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
5594     /* Not supported for variable-length vectors.  */
5595     return false;
5596
5597   vec_perm_builder sel (nelt, nelt, 1);
5598   sel.quick_grow (nelt);
5599
5600   result_chain->quick_grow (length);
5601   memcpy (result_chain->address (), dr_chain.address (),
5602           length * sizeof (tree));
5603
5604   if (pow2p_hwi (length) && vf > 4)
5605     {
5606       unsigned int j, log_length = exact_log2 (length);
5607       for (i = 0; i < nelt / 2; ++i)
5608         sel[i] = i * 2;
5609       for (i = 0; i < nelt / 2; ++i)
5610         sel[nelt / 2 + i] = i * 2 + 1;
5611       vec_perm_indices indices (sel, 2, nelt);
5612       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5613         {
5614           if (dump_enabled_p ())
5615             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5616                              "shuffle of 2 fields structure is not \
5617                               supported by target\n");
5618           return false;
5619         }
5620       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
5621
5622       for (i = 0; i < nelt / 2; ++i)
5623         sel[i] = i * 2 + 1;
5624       for (i = 0; i < nelt / 2; ++i)
5625         sel[nelt / 2 + i] = i * 2;
5626       indices.new_vector (sel, 2, nelt);
5627       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5628         {
5629           if (dump_enabled_p ())
5630             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5631                              "shuffle of 2 fields structure is not \
5632                               supported by target\n");
5633           return false;
5634         }
5635       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
5636
5637       /* Generating permutation constant to shift all elements.
5638          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
5639       for (i = 0; i < nelt; i++)
5640         sel[i] = nelt / 2 + i;
5641       indices.new_vector (sel, 2, nelt);
5642       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5643         {
5644           if (dump_enabled_p ())
5645             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5646                              "shift permutation is not supported by target\n");
5647           return false;
5648         }
5649       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5650
5651       /* Generating permutation constant to select vector from 2.
5652          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
5653       for (i = 0; i < nelt / 2; i++)
5654         sel[i] = i;
5655       for (i = nelt / 2; i < nelt; i++)
5656         sel[i] = nelt + i;
5657       indices.new_vector (sel, 2, nelt);
5658       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5659         {
5660           if (dump_enabled_p ())
5661             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5662                              "select is not supported by target\n");
5663           return false;
5664         }
5665       select_mask = vect_gen_perm_mask_checked (vectype, indices);
5666
5667       for (i = 0; i < log_length; i++)
5668         {
5669           for (j = 0; j < length; j += 2)
5670             {
5671               first_vect = dr_chain[j];
5672               second_vect = dr_chain[j + 1];
5673
5674               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5675               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5676                                                first_vect, first_vect,
5677                                                perm2_mask1);
5678               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5679               vect[0] = data_ref;
5680
5681               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5682               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5683                                                second_vect, second_vect,
5684                                                perm2_mask2);
5685               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5686               vect[1] = data_ref;
5687
5688               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5689               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5690                                                vect[0], vect[1], shift1_mask);
5691               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5692               (*result_chain)[j/2 + length/2] = data_ref;
5693
5694               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5695               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5696                                                vect[0], vect[1], select_mask);
5697               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5698               (*result_chain)[j/2] = data_ref;
5699             }
5700           memcpy (dr_chain.address (), result_chain->address (),
5701                   length * sizeof (tree));
5702         }
5703       return true;
5704     }
5705   if (length == 3 && vf > 2)
5706     {
5707       unsigned int k = 0, l = 0;
5708
5709       /* Generating permutation constant to get all elements in rigth order.
5710          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
5711       for (i = 0; i < nelt; i++)
5712         {
5713           if (3 * k + (l % 3) >= nelt)
5714             {
5715               k = 0;
5716               l += (3 - (nelt % 3));
5717             }
5718           sel[i] = 3 * k + (l % 3);
5719           k++;
5720         }
5721       vec_perm_indices indices (sel, 2, nelt);
5722       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5723         {
5724           if (dump_enabled_p ())
5725             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5726                              "shuffle of 3 fields structure is not \
5727                               supported by target\n");
5728           return false;
5729         }
5730       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
5731
5732       /* Generating permutation constant to shift all elements.
5733          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
5734       for (i = 0; i < nelt; i++)
5735         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5736       indices.new_vector (sel, 2, nelt);
5737       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5738         {
5739           if (dump_enabled_p ())
5740             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5741                              "shift permutation is not supported by target\n");
5742           return false;
5743         }
5744       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5745
5746       /* Generating permutation constant to shift all elements.
5747          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5748       for (i = 0; i < nelt; i++)
5749         sel[i] = 2 * (nelt / 3) + 1 + i;
5750       indices.new_vector (sel, 2, nelt);
5751       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5752         {
5753           if (dump_enabled_p ())
5754             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5755                              "shift permutation is not supported by target\n");
5756           return false;
5757         }
5758       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
5759
5760       /* Generating permutation constant to shift all elements.
5761          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
5762       for (i = 0; i < nelt; i++)
5763         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5764       indices.new_vector (sel, 2, nelt);
5765       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5766         {
5767           if (dump_enabled_p ())
5768             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5769                              "shift permutation is not supported by target\n");
5770           return false;
5771         }
5772       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
5773
5774       /* Generating permutation constant to shift all elements.
5775          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5776       for (i = 0; i < nelt; i++)
5777         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5778       indices.new_vector (sel, 2, nelt);
5779       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5780         {
5781           if (dump_enabled_p ())
5782             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5783                              "shift permutation is not supported by target\n");
5784           return false;
5785         }
5786       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
5787
5788       for (k = 0; k < 3; k++)
5789         {
5790           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5791           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5792                                            dr_chain[k], dr_chain[k],
5793                                            perm3_mask);
5794           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5795           vect[k] = data_ref;
5796         }
5797
5798       for (k = 0; k < 3; k++)
5799         {
5800           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5801           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5802                                            vect[k % 3], vect[(k + 1) % 3],
5803                                            shift1_mask);
5804           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5805           vect_shift[k] = data_ref;
5806         }
5807
5808       for (k = 0; k < 3; k++)
5809         {
5810           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5811           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5812                                            vect_shift[(4 - k) % 3],
5813                                            vect_shift[(3 - k) % 3],
5814                                            shift2_mask);
5815           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5816           vect[k] = data_ref;
5817         }
5818
5819       (*result_chain)[3 - (nelt % 3)] = vect[2];
5820
5821       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5822       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5823                                        vect[0], shift3_mask);
5824       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5825       (*result_chain)[nelt % 3] = data_ref;
5826
5827       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5828       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5829                                        vect[1], shift4_mask);
5830       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5831       (*result_chain)[0] = data_ref;
5832       return true;
5833     }
5834   return false;
5835 }
5836
5837 /* Function vect_transform_grouped_load.
5838
5839    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5840    to perform their permutation and ascribe the result vectorized statements to
5841    the scalar statements.
5842 */
5843
5844 void
5845 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5846                              gimple_stmt_iterator *gsi)
5847 {
5848   machine_mode mode;
5849   vec<tree> result_chain = vNULL;
5850
5851   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5852      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5853      vectors, that are ready for vector computation.  */
5854   result_chain.create (size);
5855
5856   /* If reassociation width for vector type is 2 or greater target machine can
5857      execute 2 or more vector instructions in parallel.  Otherwise try to
5858      get chain for loads group using vect_shift_permute_load_chain.  */
5859   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5860   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5861       || pow2p_hwi (size)
5862       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5863                                          gsi, &result_chain))
5864     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5865   vect_record_grouped_load_vectors (stmt, result_chain);
5866   result_chain.release ();
5867 }
5868
5869 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5870    generated as part of the vectorization of STMT.  Assign the statement
5871    for each vector to the associated scalar statement.  */
5872
5873 void
5874 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5875 {
5876   gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5877   gimple *next_stmt, *new_stmt;
5878   unsigned int i, gap_count;
5879   tree tmp_data_ref;
5880
5881   /* Put a permuted data-ref in the VECTORIZED_STMT field.
5882      Since we scan the chain starting from it's first node, their order
5883      corresponds the order of data-refs in RESULT_CHAIN.  */
5884   next_stmt = first_stmt;
5885   gap_count = 1;
5886   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5887     {
5888       if (!next_stmt)
5889         break;
5890
5891       /* Skip the gaps.  Loads created for the gaps will be removed by dead
5892        code elimination pass later.  No need to check for the first stmt in
5893        the group, since it always exists.
5894        GROUP_GAP is the number of steps in elements from the previous
5895        access (if there is no gap GROUP_GAP is 1).  We skip loads that
5896        correspond to the gaps.  */
5897       if (next_stmt != first_stmt
5898           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5899       {
5900         gap_count++;
5901         continue;
5902       }
5903
5904       while (next_stmt)
5905         {
5906           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5907           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5908              copies, and we put the new vector statement in the first available
5909              RELATED_STMT.  */
5910           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5911             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5912           else
5913             {
5914               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5915                 {
5916                   gimple *prev_stmt =
5917                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5918                   gimple *rel_stmt =
5919                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5920                   while (rel_stmt)
5921                     {
5922                       prev_stmt = rel_stmt;
5923                       rel_stmt =
5924                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5925                     }
5926
5927                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5928                     new_stmt;
5929                 }
5930             }
5931
5932           next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5933           gap_count = 1;
5934           /* If NEXT_STMT accesses the same DR as the previous statement,
5935              put the same TMP_DATA_REF as its vectorized statement; otherwise
5936              get the next data-ref from RESULT_CHAIN.  */
5937           if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5938             break;
5939         }
5940     }
5941 }
5942
5943 /* Function vect_force_dr_alignment_p.
5944
5945    Returns whether the alignment of a DECL can be forced to be aligned
5946    on ALIGNMENT bit boundary.  */
5947
5948 bool
5949 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5950 {
5951   if (!VAR_P (decl))
5952     return false;
5953
5954   if (decl_in_symtab_p (decl)
5955       && !symtab_node::get (decl)->can_increase_alignment_p ())
5956     return false;
5957
5958   if (TREE_STATIC (decl))
5959     return (alignment <= MAX_OFILE_ALIGNMENT);
5960   else
5961     return (alignment <= MAX_STACK_ALIGNMENT);
5962 }
5963
5964
5965 /* Return whether the data reference DR is supported with respect to its
5966    alignment.
5967    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5968    it is aligned, i.e., check if it is possible to vectorize it with different
5969    alignment.  */
5970
5971 enum dr_alignment_support
5972 vect_supportable_dr_alignment (struct data_reference *dr,
5973                                bool check_aligned_accesses)
5974 {
5975   gimple *stmt = DR_STMT (dr);
5976   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5977   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5978   machine_mode mode = TYPE_MODE (vectype);
5979   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5980   struct loop *vect_loop = NULL;
5981   bool nested_in_vect_loop = false;
5982
5983   if (aligned_access_p (dr) && !check_aligned_accesses)
5984     return dr_aligned;
5985
5986   /* For now assume all conditional loads/stores support unaligned
5987      access without any special code.  */
5988   if (is_gimple_call (stmt)
5989       && gimple_call_internal_p (stmt)
5990       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5991           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5992     return dr_unaligned_supported;
5993
5994   if (loop_vinfo)
5995     {
5996       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5997       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5998     }
5999
6000   /* Possibly unaligned access.  */
6001
6002   /* We can choose between using the implicit realignment scheme (generating
6003      a misaligned_move stmt) and the explicit realignment scheme (generating
6004      aligned loads with a REALIGN_LOAD).  There are two variants to the
6005      explicit realignment scheme: optimized, and unoptimized.
6006      We can optimize the realignment only if the step between consecutive
6007      vector loads is equal to the vector size.  Since the vector memory
6008      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6009      is guaranteed that the misalignment amount remains the same throughout the
6010      execution of the vectorized loop.  Therefore, we can create the
6011      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6012      at the loop preheader.
6013
6014      However, in the case of outer-loop vectorization, when vectorizing a
6015      memory access in the inner-loop nested within the LOOP that is now being
6016      vectorized, while it is guaranteed that the misalignment of the
6017      vectorized memory access will remain the same in different outer-loop
6018      iterations, it is *not* guaranteed that is will remain the same throughout
6019      the execution of the inner-loop.  This is because the inner-loop advances
6020      with the original scalar step (and not in steps of VS).  If the inner-loop
6021      step happens to be a multiple of VS, then the misalignment remains fixed
6022      and we can use the optimized realignment scheme.  For example:
6023
6024       for (i=0; i<N; i++)
6025         for (j=0; j<M; j++)
6026           s += a[i+j];
6027
6028      When vectorizing the i-loop in the above example, the step between
6029      consecutive vector loads is 1, and so the misalignment does not remain
6030      fixed across the execution of the inner-loop, and the realignment cannot
6031      be optimized (as illustrated in the following pseudo vectorized loop):
6032
6033       for (i=0; i<N; i+=4)
6034         for (j=0; j<M; j++){
6035           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6036                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6037                          // (assuming that we start from an aligned address).
6038           }
6039
6040      We therefore have to use the unoptimized realignment scheme:
6041
6042       for (i=0; i<N; i+=4)
6043           for (j=k; j<M; j+=4)
6044           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6045                            // that the misalignment of the initial address is
6046                            // 0).
6047
6048      The loop can then be vectorized as follows:
6049
6050       for (k=0; k<4; k++){
6051         rt = get_realignment_token (&vp[k]);
6052         for (i=0; i<N; i+=4){
6053           v1 = vp[i+k];
6054           for (j=k; j<M; j+=4){
6055             v2 = vp[i+j+VS-1];
6056             va = REALIGN_LOAD <v1,v2,rt>;
6057             vs += va;
6058             v1 = v2;
6059           }
6060         }
6061     } */
6062
6063   if (DR_IS_READ (dr))
6064     {
6065       bool is_packed = false;
6066       tree type = (TREE_TYPE (DR_REF (dr)));
6067
6068       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6069           && (!targetm.vectorize.builtin_mask_for_load
6070               || targetm.vectorize.builtin_mask_for_load ()))
6071         {
6072           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6073
6074           /* If we are doing SLP then the accesses need not have the
6075              same alignment, instead it depends on the SLP group size.  */
6076           if (loop_vinfo
6077               && STMT_SLP_TYPE (stmt_info)
6078               && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6079                               * GROUP_SIZE (vinfo_for_stmt
6080                                             (GROUP_FIRST_ELEMENT (stmt_info))),
6081                               TYPE_VECTOR_SUBPARTS (vectype)))
6082             ;
6083           else if (!loop_vinfo
6084                    || (nested_in_vect_loop
6085                        && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6086                                     GET_MODE_SIZE (TYPE_MODE (vectype)))))
6087             return dr_explicit_realign;
6088           else
6089             return dr_explicit_realign_optimized;
6090         }
6091       if (!known_alignment_for_access_p (dr))
6092         is_packed = not_size_aligned (DR_REF (dr));
6093
6094       if (targetm.vectorize.support_vector_misalignment
6095             (mode, type, DR_MISALIGNMENT (dr), is_packed))
6096         /* Can't software pipeline the loads, but can at least do them.  */
6097         return dr_unaligned_supported;
6098     }
6099   else
6100     {
6101       bool is_packed = false;
6102       tree type = (TREE_TYPE (DR_REF (dr)));
6103
6104       if (!known_alignment_for_access_p (dr))
6105         is_packed = not_size_aligned (DR_REF (dr));
6106
6107      if (targetm.vectorize.support_vector_misalignment
6108            (mode, type, DR_MISALIGNMENT (dr), is_packed))
6109        return dr_unaligned_supported;
6110     }
6111
6112   /* Unsupported.  */
6113   return dr_unaligned_unsupported;
6114 }