gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "params.h"
  53 #include "tree-cfg.h"
  54 #include "tree-hash-traits.h"
  55 #include "vec-perm-indices.h"
  56
  57 /* Return true if load- or store-lanes optab OPTAB is implemented for
  58    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  59
  60 static bool
  61 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  62                               tree vectype, unsigned HOST_WIDE_INT count)
  63 {
  64   machine_mode mode;
  65   scalar_int_mode array_mode;
  66   bool limit_p;
  67
  68   mode = TYPE_MODE (vectype);
  69   limit_p = !targetm.array_mode_supported_p (mode, count);
  70   if (!int_mode_for_size (count * GET_MODE_BITSIZE (mode),
  71                           limit_p).exists (&array_mode))
  72     {
  73       if (dump_enabled_p ())
  74         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  75                          "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
  76                          GET_MODE_NAME (mode), count);
  77       return false;
  78     }
  79
  80   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  81     {
  82       if (dump_enabled_p ())
  83         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  84                          "cannot use %s<%s><%s>\n", name,
  85                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  86       return false;
  87     }
  88
  89   if (dump_enabled_p ())
  90     dump_printf_loc (MSG_NOTE, vect_location,
  91                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  92                      GET_MODE_NAME (mode));
  93
  94   return true;
  95 }
  96
  97
  98 /* Return the smallest scalar part of STMT.
  99    This is used to determine the vectype of the stmt.  We generally set the
 100    vectype according to the type of the result (lhs).  For stmts whose
 101    result-type is different than the type of the arguments (e.g., demotion,
 102    promotion), vectype will be reset appropriately (later).  Note that we have
 103    to visit the smallest datatype in this function, because that determines the
 104    VF.  If the smallest datatype in the loop is present only as the rhs of a
 105    promotion operation - we'd miss it.
 106    Such a case, where a variable of this datatype does not appear in the lhs
 107    anywhere in the loop, can only occur if it's an invariant: e.g.:
 108    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 109    invariant motion.  However, we cannot rely on invariant motion to always
 110    take invariants out of the loop, and so in the case of promotion we also
 111    have to check the rhs.
 112    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 113    types.  */
 114
 115 tree
 116 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
 117                                HOST_WIDE_INT *rhs_size_unit)
 118 {
 119   tree scalar_type = gimple_expr_type (stmt);
 120   HOST_WIDE_INT lhs, rhs;
 121
 122   /* During the analysis phase, this function is called on arbitrary
 123      statements that might not have scalar results.  */
 124   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
 125     return scalar_type;
 126
 127   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 128
 129   if (is_gimple_assign (stmt)
 130       && (gimple_assign_cast_p (stmt)
 131           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
 132           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
 133           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
 134     {
 135       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 136
 137       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 138       if (rhs < lhs)
 139         scalar_type = rhs_type;
 140     }
 141
 142   *lhs_size_unit = lhs;
 143   *rhs_size_unit = rhs;
 144   return scalar_type;
 145 }
 146
 147
 148 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 149    tested at run-time.  Return TRUE if DDR was successfully inserted.
 150    Return false if versioning is not supported.  */
 151
 152 static bool
 153 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 154 {
 155   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 156
 157   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 158     return false;
 159
 160   if (!runtime_alias_check_p (ddr, loop,
 161                               optimize_loop_nest_for_speed_p (loop)))
 162     return false;
 163
 164   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 165   return true;
 166 }
 167
 168
 169 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 170    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 171    distances.  These distances are conservatively correct but they don't
 172    reflect a guaranteed dependence.
 173
 174    Return true if this function does all the work necessary to avoid
 175    an alias or false if the caller should use the dependence distances
 176    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 177    the depth of the loop described by LOOP_VINFO and the other arguments
 178    are as for vect_analyze_data_ref_dependence.  */
 179
 180 static bool
 181 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 182                                        loop_vec_info loop_vinfo,
 183                                        int loop_depth, unsigned int *max_vf)
 184 {
 185   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 186   lambda_vector dist_v;
 187   unsigned int i;
 188   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 189     {
 190       int dist = dist_v[loop_depth];
 191       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 192         {
 193           /* If the user asserted safelen >= DIST consecutive iterations
 194              can be executed concurrently, assume independence.
 195
 196              ??? An alternative would be to add the alias check even
 197              in this case, and vectorize the fallback loop with the
 198              maximum VF set to safelen.  However, if the user has
 199              explicitly given a length, it's less likely that that
 200              would be a win.  */
 201           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 202             {
 203               if ((unsigned int) loop->safelen < *max_vf)
 204                 *max_vf = loop->safelen;
 205               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 206               continue;
 207             }
 208
 209           /* For dependence distances of 2 or more, we have the option
 210              of limiting VF or checking for an alias at runtime.
 211              Prefer to check at runtime if we can, to avoid limiting
 212              the VF unnecessarily when the bases are in fact independent.
 213
 214              Note that the alias checks will be removed if the VF ends up
 215              being small enough.  */
 216           return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 217         }
 218     }
 219   return true;
 220 }
 221
 222
 223 /* Function vect_analyze_data_ref_dependence.
 224
 225    Return TRUE if there (might) exist a dependence between a memory-reference
 226    DRA and a memory-reference DRB.  When versioning for alias may check a
 227    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 228    the data dependence.  */
 229
 230 static bool
 231 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 232                                   loop_vec_info loop_vinfo,
 233                                   unsigned int *max_vf)
 234 {
 235   unsigned int i;
 236   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 237   struct data_reference *dra = DDR_A (ddr);
 238   struct data_reference *drb = DDR_B (ddr);
 239   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
 240   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
 241   lambda_vector dist_v;
 242   unsigned int loop_depth;
 243
 244   /* In loop analysis all data references should be vectorizable.  */
 245   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 246       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 247     gcc_unreachable ();
 248
 249   /* Independent data accesses.  */
 250   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 251     return false;
 252
 253   if (dra == drb
 254       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 255     return false;
 256
 257   /* We do not have to consider dependences between accesses that belong
 258      to the same group.  */
 259   if (GROUP_FIRST_ELEMENT (stmtinfo_a)
 260       && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
 261     return false;
 262
 263   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 264      least two scalar iterations, there is always also a true dependence.
 265      As the vectorizer does not re-order loads and stores we can ignore
 266      the anti-dependence if TBAA can disambiguate both DRs similar to the
 267      case with known negative distance anti-dependences (positive
 268      distance anti-dependences would violate TBAA constraints).  */
 269   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 270        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 271       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 272                                  get_alias_set (DR_REF (drb))))
 273     return false;
 274
 275   /* Unknown data dependence.  */
 276   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 277     {
 278       /* If user asserted safelen consecutive iterations can be
 279          executed concurrently, assume independence.  */
 280       if (loop->safelen >= 2)
 281         {
 282           if ((unsigned int) loop->safelen < *max_vf)
 283             *max_vf = loop->safelen;
 284           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 285           return false;
 286         }
 287
 288       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 289           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 290         {
 291           if (dump_enabled_p ())
 292             {
 293               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 294                                "versioning for alias not supported for: "
 295                                "can't determine dependence between ");
 296               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 297                                  DR_REF (dra));
 298               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 299               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 300                                  DR_REF (drb));
 301               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 302             }
 303           return true;
 304         }
 305
 306       if (dump_enabled_p ())
 307         {
 308           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 309                            "versioning for alias required: "
 310                            "can't determine dependence between ");
 311           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 312                              DR_REF (dra));
 313           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 314           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 315                              DR_REF (drb));
 316           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 317         }
 318
 319       /* Add to list of ddrs that need to be tested at run-time.  */
 320       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 321     }
 322
 323   /* Known data dependence.  */
 324   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 325     {
 326       /* If user asserted safelen consecutive iterations can be
 327          executed concurrently, assume independence.  */
 328       if (loop->safelen >= 2)
 329         {
 330           if ((unsigned int) loop->safelen < *max_vf)
 331             *max_vf = loop->safelen;
 332           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 333           return false;
 334         }
 335
 336       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 337           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 338         {
 339           if (dump_enabled_p ())
 340             {
 341               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 342                                "versioning for alias not supported for: "
 343                                "bad dist vector for ");
 344               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 345                                  DR_REF (dra));
 346               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 347               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 348                                  DR_REF (drb));
 349               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 350             }
 351           return true;
 352         }
 353
 354       if (dump_enabled_p ())
 355         {
 356           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 357                            "versioning for alias required: "
 358                            "bad dist vector for ");
 359           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 360           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 361           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 362           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 363         }
 364       /* Add to list of ddrs that need to be tested at run-time.  */
 365       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 366     }
 367
 368   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 369
 370   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
 371       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
 372                                                 loop_depth, max_vf))
 373     return false;
 374
 375   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 376     {
 377       int dist = dist_v[loop_depth];
 378
 379       if (dump_enabled_p ())
 380         dump_printf_loc (MSG_NOTE, vect_location,
 381                          "dependence distance  = %d.\n", dist);
 382
 383       if (dist == 0)
 384         {
 385           if (dump_enabled_p ())
 386             {
 387               dump_printf_loc (MSG_NOTE, vect_location,
 388                                "dependence distance == 0 between ");
 389               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 390               dump_printf (MSG_NOTE, " and ");
 391               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 392               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 393             }
 394
 395           /* When we perform grouped accesses and perform implicit CSE
 396              by detecting equal accesses and doing disambiguation with
 397              runtime alias tests like for
 398                 .. = a[i];
 399                 .. = a[i+1];
 400                 a[i] = ..;
 401                 a[i+1] = ..;
 402                 *p = ..;
 403                 .. = a[i];
 404                 .. = a[i+1];
 405              where we will end up loading { a[i], a[i+1] } once, make
 406              sure that inserting group loads before the first load and
 407              stores after the last store will do the right thing.
 408              Similar for groups like
 409                 a[i] = ...;
 410                 ... = a[i];
 411                 a[i+1] = ...;
 412              where loads from the group interleave with the store.  */
 413           if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 414               || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 415             {
 416               gimple *earlier_stmt;
 417               earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
 418               if (DR_IS_WRITE
 419                     (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
 420                 {
 421                   if (dump_enabled_p ())
 422                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 423                                      "READ_WRITE dependence in interleaving."
 424                                      "\n");
 425                   return true;
 426                 }
 427             }
 428
 429           continue;
 430         }
 431
 432       if (dist > 0 && DDR_REVERSED_P (ddr))
 433         {
 434           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 435              reversed (to make distance vector positive), and the actual
 436              distance is negative.  */
 437           if (dump_enabled_p ())
 438             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 439                              "dependence distance negative.\n");
 440           /* Record a negative dependence distance to later limit the
 441              amount of stmt copying / unrolling we can perform.
 442              Only need to handle read-after-write dependence.  */
 443           if (DR_IS_READ (drb)
 444               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 445                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 446             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 447           continue;
 448         }
 449
 450       unsigned int abs_dist = abs (dist);
 451       if (abs_dist >= 2 && abs_dist < *max_vf)
 452         {
 453           /* The dependence distance requires reduction of the maximal
 454              vectorization factor.  */
 455           *max_vf = abs (dist);
 456           if (dump_enabled_p ())
 457             dump_printf_loc (MSG_NOTE, vect_location,
 458                              "adjusting maximal vectorization factor to %i\n",
 459                              *max_vf);
 460         }
 461
 462       if (abs_dist >= *max_vf)
 463         {
 464           /* Dependence distance does not create dependence, as far as
 465              vectorization is concerned, in this case.  */
 466           if (dump_enabled_p ())
 467             dump_printf_loc (MSG_NOTE, vect_location,
 468                              "dependence distance >= VF.\n");
 469           continue;
 470         }
 471
 472       if (dump_enabled_p ())
 473         {
 474           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 475                        "not vectorized, possible dependence "
 476                        "between data-refs ");
 477           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 478           dump_printf (MSG_NOTE,  " and ");
 479           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 480           dump_printf (MSG_NOTE,  "\n");
 481         }
 482
 483       return true;
 484     }
 485
 486   return false;
 487 }
 488
 489 /* Function vect_analyze_data_ref_dependences.
 490
 491    Examine all the data references in the loop, and make sure there do not
 492    exist any data dependences between them.  Set *MAX_VF according to
 493    the maximum vectorization factor the data dependences allow.  */
 494
 495 bool
 496 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
 497                                    unsigned int *max_vf)
 498 {
 499   unsigned int i;
 500   struct data_dependence_relation *ddr;
 501
 502   if (dump_enabled_p ())
 503     dump_printf_loc (MSG_NOTE, vect_location,
 504                      "=== vect_analyze_data_ref_dependences ===\n");
 505
 506   LOOP_VINFO_DDRS (loop_vinfo)
 507     .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 508              * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 509   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 510   /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS.  */
 511   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 512                                 &LOOP_VINFO_DDRS (loop_vinfo),
 513                                 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
 514     return false;
 515
 516   /* For epilogues we either have no aliases or alias versioning
 517      was applied to original loop.  Therefore we may just get max_vf
 518      using VF of original loop.  */
 519   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 520     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 521   else
 522     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 523       if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
 524         return false;
 525
 526   return true;
 527 }
 528
 529
 530 /* Function vect_slp_analyze_data_ref_dependence.
 531
 532    Return TRUE if there (might) exist a dependence between a memory-reference
 533    DRA and a memory-reference DRB.  When versioning for alias may check a
 534    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 535    the data dependence.  */
 536
 537 static bool
 538 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
 539 {
 540   struct data_reference *dra = DDR_A (ddr);
 541   struct data_reference *drb = DDR_B (ddr);
 542
 543   /* We need to check dependences of statements marked as unvectorizable
 544      as well, they still can prohibit vectorization.  */
 545
 546   /* Independent data accesses.  */
 547   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 548     return false;
 549
 550   if (dra == drb)
 551     return false;
 552
 553   /* Read-read is OK.  */
 554   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 555     return false;
 556
 557   /* If dra and drb are part of the same interleaving chain consider
 558      them independent.  */
 559   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
 560       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
 561           == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
 562     return false;
 563
 564   /* Unknown data dependence.  */
 565   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 566     {
 567       if  (dump_enabled_p ())
 568         {
 569           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 570                            "can't determine dependence between ");
 571           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 572           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 573           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 574           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 575         }
 576     }
 577   else if (dump_enabled_p ())
 578     {
 579       dump_printf_loc (MSG_NOTE, vect_location,
 580                        "determined dependence between ");
 581       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 582       dump_printf (MSG_NOTE, " and ");
 583       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 584       dump_printf (MSG_NOTE,  "\n");
 585     }
 586
 587   return true;
 588 }
 589
 590
 591 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 592    contain the vector of scalar stores of this instance if we are
 593    disambiguating the loads.  */
 594
 595 static bool
 596 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
 597                                    vec<gimple *> stores, gimple *last_store)
 598 {
 599   /* This walks over all stmts involved in the SLP load/store done
 600      in NODE verifying we can sink them up to the last stmt in the
 601      group.  */
 602   gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
 603   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 604     {
 605       gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
 606       if (access == last_access)
 607         continue;
 608       data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
 609       for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
 610            gsi_stmt (gsi) != last_access; gsi_next (&gsi))
 611         {
 612           gimple *stmt = gsi_stmt (gsi);
 613           if (! gimple_vuse (stmt)
 614               || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
 615             continue;
 616
 617           /* If we couldn't record a (single) data reference for this
 618              stmt we have to give up.  */
 619           /* ???  Here and below if dependence analysis fails we can resort
 620              to the alias oracle which can handle more kinds of stmts.  */
 621           data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
 622           if (!dr_b)
 623             return false;
 624
 625           bool dependent = false;
 626           /* If we run into a store of this same instance (we've just
 627              marked those) then delay dependence checking until we run
 628              into the last store because this is where it will have
 629              been sunk to (and we verify if we can do that as well).  */
 630           if (gimple_visited_p (stmt))
 631             {
 632               if (stmt != last_store)
 633                 continue;
 634               unsigned i;
 635               gimple *store;
 636               FOR_EACH_VEC_ELT (stores, i, store)
 637                 {
 638                   data_reference *store_dr
 639                     = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
 640                   ddr_p ddr = initialize_data_dependence_relation
 641                                 (dr_a, store_dr, vNULL);
 642                   dependent = vect_slp_analyze_data_ref_dependence (ddr);
 643                   free_dependence_relation (ddr);
 644                   if (dependent)
 645                     break;
 646                 }
 647             }
 648           else
 649             {
 650               ddr_p ddr = initialize_data_dependence_relation (dr_a,
 651                                                                dr_b, vNULL);
 652               dependent = vect_slp_analyze_data_ref_dependence (ddr);
 653               free_dependence_relation (ddr);
 654             }
 655           if (dependent)
 656             return false;
 657         }
 658     }
 659   return true;
 660 }
 661
 662
 663 /* Function vect_analyze_data_ref_dependences.
 664
 665    Examine all the data references in the basic-block, and make sure there
 666    do not exist any data dependences between them.  Set *MAX_VF according to
 667    the maximum vectorization factor the data dependences allow.  */
 668
 669 bool
 670 vect_slp_analyze_instance_dependence (slp_instance instance)
 671 {
 672   if (dump_enabled_p ())
 673     dump_printf_loc (MSG_NOTE, vect_location,
 674                      "=== vect_slp_analyze_instance_dependence ===\n");
 675
 676   /* The stores of this instance are at the root of the SLP tree.  */
 677   slp_tree store = SLP_INSTANCE_TREE (instance);
 678   if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
 679     store = NULL;
 680
 681   /* Verify we can sink stores to the vectorized stmt insert location.  */
 682   gimple *last_store = NULL;
 683   if (store)
 684     {
 685       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
 686         return false;
 687
 688       /* Mark stores in this instance and remember the last one.  */
 689       last_store = vect_find_last_scalar_stmt_in_slp (store);
 690       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 691         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
 692     }
 693
 694   bool res = true;
 695
 696   /* Verify we can sink loads to the vectorized stmt insert location,
 697      special-casing stores of this instance.  */
 698   slp_tree load;
 699   unsigned int i;
 700   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
 701     if (! vect_slp_analyze_node_dependences (instance, load,
 702                                              store
 703                                              ? SLP_TREE_SCALAR_STMTS (store)
 704                                              : vNULL, last_store))
 705       {
 706         res = false;
 707         break;
 708       }
 709
 710   /* Unset the visited flag.  */
 711   if (store)
 712     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 713       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
 714
 715   return res;
 716 }
 717
 718 /* Record in VINFO the base alignment guarantee given by DRB.  STMT is
 719    the statement that contains DRB, which is useful for recording in the
 720    dump file.  */
 721
 722 static void
 723 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
 724                             innermost_loop_behavior *drb)
 725 {
 726   bool existed;
 727   innermost_loop_behavior *&entry
 728     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
 729   if (!existed || entry->base_alignment < drb->base_alignment)
 730     {
 731       entry = drb;
 732       if (dump_enabled_p ())
 733         {
 734           dump_printf_loc (MSG_NOTE, vect_location,
 735                            "recording new base alignment for ");
 736           dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
 737           dump_printf (MSG_NOTE, "\n");
 738           dump_printf_loc (MSG_NOTE, vect_location,
 739                            "  alignment:    %d\n", drb->base_alignment);
 740           dump_printf_loc (MSG_NOTE, vect_location,
 741                            "  misalignment: %d\n", drb->base_misalignment);
 742           dump_printf_loc (MSG_NOTE, vect_location,
 743                            "  based on:     ");
 744           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 745         }
 746     }
 747 }
 748
 749 /* If the region we're going to vectorize is reached, all unconditional
 750    data references occur at least once.  We can therefore pool the base
 751    alignment guarantees from each unconditional reference.  Do this by
 752    going through all the data references in VINFO and checking whether
 753    the containing statement makes the reference unconditionally.  If so,
 754    record the alignment of the base address in VINFO so that it can be
 755    used for all other references with the same base.  */
 756
 757 void
 758 vect_record_base_alignments (vec_info *vinfo)
 759 {
 760   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
 761   struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
 762   data_reference *dr;
 763   unsigned int i;
 764   FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
 765     if (!DR_IS_CONDITIONAL_IN_STMT (dr))
 766       {
 767         gimple *stmt = DR_STMT (dr);
 768         vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
 769
 770         /* If DR is nested in the loop that is being vectorized, we can also
 771            record the alignment of the base wrt the outer loop.  */
 772         if (loop && nested_in_vect_loop_p (loop, stmt))
 773           {
 774             stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 775             vect_record_base_alignment
 776               (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
 777           }
 778       }
 779 }
 780
 781 /* Return the target alignment for the vectorized form of DR.  */
 782
 783 static unsigned int
 784 vect_calculate_target_alignment (struct data_reference *dr)
 785 {
 786   gimple *stmt = DR_STMT (dr);
 787   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 788   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 789   return targetm.vectorize.preferred_vector_alignment (vectype);
 790 }
 791
 792 /* Function vect_compute_data_ref_alignment
 793
 794    Compute the misalignment of the data reference DR.
 795
 796    Output:
 797    1. If during the misalignment computation it is found that the data reference
 798       cannot be vectorized then false is returned.
 799    2. DR_MISALIGNMENT (DR) is defined.
 800
 801    FOR NOW: No analysis is actually performed. Misalignment is calculated
 802    only for trivial cases. TODO.  */
 803
 804 bool
 805 vect_compute_data_ref_alignment (struct data_reference *dr)
 806 {
 807   gimple *stmt = DR_STMT (dr);
 808   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 809   vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
 810   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 811   struct loop *loop = NULL;
 812   tree ref = DR_REF (dr);
 813   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 814
 815   if (dump_enabled_p ())
 816     dump_printf_loc (MSG_NOTE, vect_location,
 817                      "vect_compute_data_ref_alignment:\n");
 818
 819   if (loop_vinfo)
 820     loop = LOOP_VINFO_LOOP (loop_vinfo);
 821
 822   /* Initialize misalignment to unknown.  */
 823   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
 824
 825   innermost_loop_behavior *drb = vect_dr_behavior (dr);
 826   bool step_preserves_misalignment_p;
 827
 828   unsigned HOST_WIDE_INT vector_alignment
 829     = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
 830   DR_TARGET_ALIGNMENT (dr) = vector_alignment;
 831
 832   /* No step for BB vectorization.  */
 833   if (!loop)
 834     {
 835       gcc_assert (integer_zerop (drb->step));
 836       step_preserves_misalignment_p = true;
 837     }
 838
 839   /* In case the dataref is in an inner-loop of the loop that is being
 840      vectorized (LOOP), we use the base and misalignment information
 841      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 842      stays the same throughout the execution of the inner-loop, which is why
 843      we have to check that the stride of the dataref in the inner-loop evenly
 844      divides by the vector alignment.  */
 845   else if (nested_in_vect_loop_p (loop, stmt))
 846     {
 847       step_preserves_misalignment_p
 848         = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
 849
 850       if (dump_enabled_p ())
 851         {
 852           if (step_preserves_misalignment_p)
 853             dump_printf_loc (MSG_NOTE, vect_location,
 854                              "inner step divides the vector alignment.\n");
 855           else
 856             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 857                              "inner step doesn't divide the vector"
 858                              " alignment.\n");
 859         }
 860     }
 861
 862   /* Similarly we can only use base and misalignment information relative to
 863      an innermost loop if the misalignment stays the same throughout the
 864      execution of the loop.  As above, this is the case if the stride of
 865      the dataref evenly divides by the alignment.  */
 866   else
 867     {
 868       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 869       step_preserves_misalignment_p
 870         = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
 871
 872       if (!step_preserves_misalignment_p && dump_enabled_p ())
 873         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 874                          "step doesn't divide the vector alignment.\n");
 875     }
 876
 877   unsigned int base_alignment = drb->base_alignment;
 878   unsigned int base_misalignment = drb->base_misalignment;
 879
 880   /* Calculate the maximum of the pooled base address alignment and the
 881      alignment that we can compute for DR itself.  */
 882   innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
 883   if (entry && base_alignment < (*entry)->base_alignment)
 884     {
 885       base_alignment = (*entry)->base_alignment;
 886       base_misalignment = (*entry)->base_misalignment;
 887     }
 888
 889   if (drb->offset_alignment < vector_alignment
 890       || !step_preserves_misalignment_p
 891       /* We need to know whether the step wrt the vectorized loop is
 892          negative when computing the starting misalignment below.  */
 893       || TREE_CODE (drb->step) != INTEGER_CST)
 894     {
 895       if (dump_enabled_p ())
 896         {
 897           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 898                            "Unknown alignment for access: ");
 899           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 900           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 901         }
 902       return true;
 903     }
 904
 905   if (base_alignment < vector_alignment)
 906     {
 907       tree base = drb->base_address;
 908       if (TREE_CODE (base) == ADDR_EXPR)
 909         base = TREE_OPERAND (base, 0);
 910       if (!vect_can_force_dr_alignment_p (base,
 911                                           vector_alignment * BITS_PER_UNIT))
 912         {
 913           if (dump_enabled_p ())
 914             {
 915               dump_printf_loc (MSG_NOTE, vect_location,
 916                                "can't force alignment of ref: ");
 917               dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 918               dump_printf (MSG_NOTE, "\n");
 919             }
 920           return true;
 921         }
 922
 923       if (DECL_USER_ALIGN (base))
 924         {
 925           if (dump_enabled_p ())
 926             {
 927               dump_printf_loc (MSG_NOTE, vect_location,
 928                                "not forcing alignment of user-aligned "
 929                                "variable: ");
 930               dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
 931               dump_printf (MSG_NOTE, "\n");
 932             }
 933           return true;
 934         }
 935
 936       /* Force the alignment of the decl.
 937          NOTE: This is the only change to the code we make during
 938          the analysis phase, before deciding to vectorize the loop.  */
 939       if (dump_enabled_p ())
 940         {
 941           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
 942           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 943           dump_printf (MSG_NOTE, "\n");
 944         }
 945
 946       DR_VECT_AUX (dr)->base_decl = base;
 947       DR_VECT_AUX (dr)->base_misaligned = true;
 948       base_misalignment = 0;
 949     }
 950   poly_int64 misalignment
 951     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
 952
 953   /* If this is a backward running DR then first access in the larger
 954      vectype actually is N-1 elements before the address in the DR.
 955      Adjust misalign accordingly.  */
 956   if (tree_int_cst_sgn (drb->step) < 0)
 957     /* PLUS because STEP is negative.  */
 958     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
 959                      * TREE_INT_CST_LOW (drb->step));
 960
 961   unsigned int const_misalignment;
 962   if (!known_misalignment (misalignment, vector_alignment,
 963                            &const_misalignment))
 964     {
 965       if (dump_enabled_p ())
 966         {
 967           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 968                            "Non-constant misalignment for access: ");
 969           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 970           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 971         }
 972       return true;
 973     }
 974
 975   SET_DR_MISALIGNMENT (dr, const_misalignment);
 976
 977   if (dump_enabled_p ())
 978     {
 979       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 980                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
 981       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 982       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 983     }
 984
 985   return true;
 986 }
 987
 988 /* Function vect_update_misalignment_for_peel.
 989    Sets DR's misalignment
 990    - to 0 if it has the same alignment as DR_PEEL,
 991    - to the misalignment computed using NPEEL if DR's salignment is known,
 992    - to -1 (unknown) otherwise.
 993
 994    DR - the data reference whose misalignment is to be adjusted.
 995    DR_PEEL - the data reference whose misalignment is being made
 996              zero in the vector loop by the peel.
 997    NPEEL - the number of iterations in the peel loop if the misalignment
 998            of DR_PEEL is known at compile time.  */
 999
1000 static void
1001 vect_update_misalignment_for_peel (struct data_reference *dr,
1002                                    struct data_reference *dr_peel, int npeel)
1003 {
1004   unsigned int i;
1005   vec<dr_p> same_aligned_drs;
1006   struct data_reference *current_dr;
1007   int dr_size = vect_get_scalar_dr_size (dr);
1008   int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1009   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1010   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1011
1012  /* For interleaved data accesses the step in the loop must be multiplied by
1013      the size of the interleaving group.  */
1014   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1015     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1016   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1017     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1018
1019   /* It can be assumed that the data refs with the same alignment as dr_peel
1020      are aligned in the vector loop.  */
1021   same_aligned_drs
1022     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1023   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1024     {
1025       if (current_dr != dr)
1026         continue;
1027       gcc_assert (!known_alignment_for_access_p (dr)
1028                   || !known_alignment_for_access_p (dr_peel)
1029                   || (DR_MISALIGNMENT (dr) / dr_size
1030                       == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1031       SET_DR_MISALIGNMENT (dr, 0);
1032       return;
1033     }
1034
1035   if (known_alignment_for_access_p (dr)
1036       && known_alignment_for_access_p (dr_peel))
1037     {
1038       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1039       int misal = DR_MISALIGNMENT (dr);
1040       misal += negative ? -npeel * dr_size : npeel * dr_size;
1041       misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1042       SET_DR_MISALIGNMENT (dr, misal);
1043       return;
1044     }
1045
1046   if (dump_enabled_p ())
1047     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1048                      "to unknown (-1).\n");
1049   SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1050 }
1051
1052
1053 /* Function verify_data_ref_alignment
1054
1055    Return TRUE if DR can be handled with respect to alignment.  */
1056
1057 static bool
1058 verify_data_ref_alignment (data_reference_p dr)
1059 {
1060   enum dr_alignment_support supportable_dr_alignment
1061     = vect_supportable_dr_alignment (dr, false);
1062   if (!supportable_dr_alignment)
1063     {
1064       if (dump_enabled_p ())
1065         {
1066           if (DR_IS_READ (dr))
1067             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1068                              "not vectorized: unsupported unaligned load.");
1069           else
1070             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1071                              "not vectorized: unsupported unaligned "
1072                              "store.");
1073
1074           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1075                              DR_REF (dr));
1076           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1077         }
1078       return false;
1079     }
1080
1081   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1082     dump_printf_loc (MSG_NOTE, vect_location,
1083                      "Vectorizing an unaligned access.\n");
1084
1085   return true;
1086 }
1087
1088 /* Function vect_verify_datarefs_alignment
1089
1090    Return TRUE if all data references in the loop can be
1091    handled with respect to alignment.  */
1092
1093 bool
1094 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1095 {
1096   vec<data_reference_p> datarefs = vinfo->datarefs;
1097   struct data_reference *dr;
1098   unsigned int i;
1099
1100   FOR_EACH_VEC_ELT (datarefs, i, dr)
1101     {
1102       gimple *stmt = DR_STMT (dr);
1103       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1104
1105       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1106         continue;
1107
1108       /* For interleaving, only the alignment of the first access matters.   */
1109       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1110           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1111         continue;
1112
1113       /* Strided accesses perform only component accesses, alignment is
1114          irrelevant for them.  */
1115       if (STMT_VINFO_STRIDED_P (stmt_info)
1116           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1117         continue;
1118
1119       if (! verify_data_ref_alignment (dr))
1120         return false;
1121     }
1122
1123   return true;
1124 }
1125
1126 /* Given an memory reference EXP return whether its alignment is less
1127    than its size.  */
1128
1129 static bool
1130 not_size_aligned (tree exp)
1131 {
1132   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1133     return true;
1134
1135   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1136           > get_object_alignment (exp));
1137 }
1138
1139 /* Function vector_alignment_reachable_p
1140
1141    Return true if vector alignment for DR is reachable by peeling
1142    a few loop iterations.  Return false otherwise.  */
1143
1144 static bool
1145 vector_alignment_reachable_p (struct data_reference *dr)
1146 {
1147   gimple *stmt = DR_STMT (dr);
1148   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1149   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1150
1151   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1152     {
1153       /* For interleaved access we peel only if number of iterations in
1154          the prolog loop ({VF - misalignment}), is a multiple of the
1155          number of the interleaved accesses.  */
1156       int elem_size, mis_in_elements;
1157
1158       /* FORNOW: handle only known alignment.  */
1159       if (!known_alignment_for_access_p (dr))
1160         return false;
1161
1162       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1163       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1164       elem_size = vector_element_size (vector_size, nelements);
1165       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1166
1167       if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
1168         return false;
1169     }
1170
1171   /* If misalignment is known at the compile time then allow peeling
1172      only if natural alignment is reachable through peeling.  */
1173   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1174     {
1175       HOST_WIDE_INT elmsize =
1176                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1177       if (dump_enabled_p ())
1178         {
1179           dump_printf_loc (MSG_NOTE, vect_location,
1180                            "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1181           dump_printf (MSG_NOTE,
1182                        ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1183         }
1184       if (DR_MISALIGNMENT (dr) % elmsize)
1185         {
1186           if (dump_enabled_p ())
1187             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1188                              "data size does not divide the misalignment.\n");
1189           return false;
1190         }
1191     }
1192
1193   if (!known_alignment_for_access_p (dr))
1194     {
1195       tree type = TREE_TYPE (DR_REF (dr));
1196       bool is_packed = not_size_aligned (DR_REF (dr));
1197       if (dump_enabled_p ())
1198         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199                          "Unknown misalignment, %snaturally aligned\n",
1200                          is_packed ? "not " : "");
1201       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1202     }
1203
1204   return true;
1205 }
1206
1207
1208 /* Calculate the cost of the memory access represented by DR.  */
1209
1210 static void
1211 vect_get_data_access_cost (struct data_reference *dr,
1212                            unsigned int *inside_cost,
1213                            unsigned int *outside_cost,
1214                            stmt_vector_for_cost *body_cost_vec)
1215 {
1216   gimple *stmt = DR_STMT (dr);
1217   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1218   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1219   int ncopies;
1220
1221   if (PURE_SLP_STMT (stmt_info))
1222     ncopies = 1;
1223   else
1224     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1225
1226   if (DR_IS_READ (dr))
1227     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1228                         NULL, body_cost_vec, false);
1229   else
1230     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1231
1232   if (dump_enabled_p ())
1233     dump_printf_loc (MSG_NOTE, vect_location,
1234                      "vect_get_data_access_cost: inside_cost = %d, "
1235                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1236 }
1237
1238
1239 typedef struct _vect_peel_info
1240 {
1241   struct data_reference *dr;
1242   int npeel;
1243   unsigned int count;
1244 } *vect_peel_info;
1245
1246 typedef struct _vect_peel_extended_info
1247 {
1248   struct _vect_peel_info peel_info;
1249   unsigned int inside_cost;
1250   unsigned int outside_cost;
1251 } *vect_peel_extended_info;
1252
1253
1254 /* Peeling hashtable helpers.  */
1255
1256 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1257 {
1258   static inline hashval_t hash (const _vect_peel_info *);
1259   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1260 };
1261
1262 inline hashval_t
1263 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1264 {
1265   return (hashval_t) peel_info->npeel;
1266 }
1267
1268 inline bool
1269 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1270 {
1271   return (a->npeel == b->npeel);
1272 }
1273
1274
1275 /* Insert DR into peeling hash table with NPEEL as key.  */
1276
1277 static void
1278 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1279                           loop_vec_info loop_vinfo, struct data_reference *dr,
1280                           int npeel)
1281 {
1282   struct _vect_peel_info elem, *slot;
1283   _vect_peel_info **new_slot;
1284   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1285
1286   elem.npeel = npeel;
1287   slot = peeling_htab->find (&elem);
1288   if (slot)
1289     slot->count++;
1290   else
1291     {
1292       slot = XNEW (struct _vect_peel_info);
1293       slot->npeel = npeel;
1294       slot->dr = dr;
1295       slot->count = 1;
1296       new_slot = peeling_htab->find_slot (slot, INSERT);
1297       *new_slot = slot;
1298     }
1299
1300   if (!supportable_dr_alignment
1301       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1302     slot->count += VECT_MAX_COST;
1303 }
1304
1305
1306 /* Traverse peeling hash table to find peeling option that aligns maximum
1307    number of data accesses.  */
1308
1309 int
1310 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1311                                      _vect_peel_extended_info *max)
1312 {
1313   vect_peel_info elem = *slot;
1314
1315   if (elem->count > max->peel_info.count
1316       || (elem->count == max->peel_info.count
1317           && max->peel_info.npeel > elem->npeel))
1318     {
1319       max->peel_info.npeel = elem->npeel;
1320       max->peel_info.count = elem->count;
1321       max->peel_info.dr = elem->dr;
1322     }
1323
1324   return 1;
1325 }
1326
1327 /* Get the costs of peeling NPEEL iterations checking data access costs
1328    for all data refs.  If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1329    misalignment will be zero after peeling.  */
1330
1331 static void
1332 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1333                                 struct data_reference *dr0,
1334                                 unsigned int *inside_cost,
1335                                 unsigned int *outside_cost,
1336                                 stmt_vector_for_cost *body_cost_vec,
1337                                 unsigned int npeel,
1338                                 bool unknown_misalignment)
1339 {
1340   unsigned i;
1341   data_reference *dr;
1342
1343   FOR_EACH_VEC_ELT (datarefs, i, dr)
1344     {
1345       gimple *stmt = DR_STMT (dr);
1346       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1347       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1348         continue;
1349
1350       /* For interleaving, only the alignment of the first access
1351          matters.  */
1352       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1353           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1354         continue;
1355
1356       /* Strided accesses perform only component accesses, alignment is
1357          irrelevant for them.  */
1358       if (STMT_VINFO_STRIDED_P (stmt_info)
1359           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1360         continue;
1361
1362       int save_misalignment;
1363       save_misalignment = DR_MISALIGNMENT (dr);
1364       if (npeel == 0)
1365         ;
1366       else if (unknown_misalignment && dr == dr0)
1367         SET_DR_MISALIGNMENT (dr, 0);
1368       else
1369         vect_update_misalignment_for_peel (dr, dr0, npeel);
1370       vect_get_data_access_cost (dr, inside_cost, outside_cost,
1371                                  body_cost_vec);
1372       SET_DR_MISALIGNMENT (dr, save_misalignment);
1373     }
1374 }
1375
1376 /* Traverse peeling hash table and calculate cost for each peeling option.
1377    Find the one with the lowest cost.  */
1378
1379 int
1380 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1381                                    _vect_peel_extended_info *min)
1382 {
1383   vect_peel_info elem = *slot;
1384   int dummy;
1385   unsigned int inside_cost = 0, outside_cost = 0;
1386   gimple *stmt = DR_STMT (elem->dr);
1387   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1388   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1389   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1390                        epilogue_cost_vec;
1391
1392   prologue_cost_vec.create (2);
1393   body_cost_vec.create (2);
1394   epilogue_cost_vec.create (2);
1395
1396   vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1397                                   elem->dr, &inside_cost, &outside_cost,
1398                                   &body_cost_vec, elem->npeel, false);
1399
1400   body_cost_vec.release ();
1401
1402   outside_cost += vect_get_known_peeling_cost
1403     (loop_vinfo, elem->npeel, &dummy,
1404      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1405      &prologue_cost_vec, &epilogue_cost_vec);
1406
1407   /* Prologue and epilogue costs are added to the target model later.
1408      These costs depend only on the scalar iteration cost, the
1409      number of peeling iterations finally chosen, and the number of
1410      misaligned statements.  So discard the information found here.  */
1411   prologue_cost_vec.release ();
1412   epilogue_cost_vec.release ();
1413
1414   if (inside_cost < min->inside_cost
1415       || (inside_cost == min->inside_cost
1416           && outside_cost < min->outside_cost))
1417     {
1418       min->inside_cost = inside_cost;
1419       min->outside_cost = outside_cost;
1420       min->peel_info.dr = elem->dr;
1421       min->peel_info.npeel = elem->npeel;
1422       min->peel_info.count = elem->count;
1423     }
1424
1425   return 1;
1426 }
1427
1428
1429 /* Choose best peeling option by traversing peeling hash table and either
1430    choosing an option with the lowest cost (if cost model is enabled) or the
1431    option that aligns as many accesses as possible.  */
1432
1433 static struct _vect_peel_extended_info
1434 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1435                                        loop_vec_info loop_vinfo)
1436 {
1437    struct _vect_peel_extended_info res;
1438
1439    res.peel_info.dr = NULL;
1440
1441    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1442      {
1443        res.inside_cost = INT_MAX;
1444        res.outside_cost = INT_MAX;
1445        peeling_htab->traverse <_vect_peel_extended_info *,
1446                                vect_peeling_hash_get_lowest_cost> (&res);
1447      }
1448    else
1449      {
1450        res.peel_info.count = 0;
1451        peeling_htab->traverse <_vect_peel_extended_info *,
1452                                vect_peeling_hash_get_most_frequent> (&res);
1453        res.inside_cost = 0;
1454        res.outside_cost = 0;
1455      }
1456
1457    return res;
1458 }
1459
1460 /* Return true if the new peeling NPEEL is supported.  */
1461
1462 static bool
1463 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1464                           unsigned npeel)
1465 {
1466   unsigned i;
1467   struct data_reference *dr = NULL;
1468   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1469   gimple *stmt;
1470   stmt_vec_info stmt_info;
1471   enum dr_alignment_support supportable_dr_alignment;
1472
1473   /* Ensure that all data refs can be vectorized after the peel.  */
1474   FOR_EACH_VEC_ELT (datarefs, i, dr)
1475     {
1476       int save_misalignment;
1477
1478       if (dr == dr0)
1479         continue;
1480
1481       stmt = DR_STMT (dr);
1482       stmt_info = vinfo_for_stmt (stmt);
1483       /* For interleaving, only the alignment of the first access
1484          matters.  */
1485       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1486           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1487         continue;
1488
1489       /* Strided accesses perform only component accesses, alignment is
1490          irrelevant for them.  */
1491       if (STMT_VINFO_STRIDED_P (stmt_info)
1492           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1493         continue;
1494
1495       save_misalignment = DR_MISALIGNMENT (dr);
1496       vect_update_misalignment_for_peel (dr, dr0, npeel);
1497       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1498       SET_DR_MISALIGNMENT (dr, save_misalignment);
1499
1500       if (!supportable_dr_alignment)
1501         return false;
1502     }
1503
1504   return true;
1505 }
1506
1507 /* Function vect_enhance_data_refs_alignment
1508
1509    This pass will use loop versioning and loop peeling in order to enhance
1510    the alignment of data references in the loop.
1511
1512    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1513    original loop is to be vectorized.  Any other loops that are created by
1514    the transformations performed in this pass - are not supposed to be
1515    vectorized.  This restriction will be relaxed.
1516
1517    This pass will require a cost model to guide it whether to apply peeling
1518    or versioning or a combination of the two.  For example, the scheme that
1519    intel uses when given a loop with several memory accesses, is as follows:
1520    choose one memory access ('p') which alignment you want to force by doing
1521    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1522    other accesses are not necessarily aligned, or (2) use loop versioning to
1523    generate one loop in which all accesses are aligned, and another loop in
1524    which only 'p' is necessarily aligned.
1525
1526    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1527    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1528    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1529
1530    Devising a cost model is the most critical aspect of this work.  It will
1531    guide us on which access to peel for, whether to use loop versioning, how
1532    many versions to create, etc.  The cost model will probably consist of
1533    generic considerations as well as target specific considerations (on
1534    powerpc for example, misaligned stores are more painful than misaligned
1535    loads).
1536
1537    Here are the general steps involved in alignment enhancements:
1538
1539      -- original loop, before alignment analysis:
1540         for (i=0; i<N; i++){
1541           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1542           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1543         }
1544
1545      -- After vect_compute_data_refs_alignment:
1546         for (i=0; i<N; i++){
1547           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1548           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1549         }
1550
1551      -- Possibility 1: we do loop versioning:
1552      if (p is aligned) {
1553         for (i=0; i<N; i++){    # loop 1A
1554           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1555           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1556         }
1557      }
1558      else {
1559         for (i=0; i<N; i++){    # loop 1B
1560           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1561           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1562         }
1563      }
1564
1565      -- Possibility 2: we do loop peeling:
1566      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1567         x = q[i];
1568         p[i] = y;
1569      }
1570      for (i = 3; i < N; i++){   # loop 2A
1571         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1572         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1573      }
1574
1575      -- Possibility 3: combination of loop peeling and versioning:
1576      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1577         x = q[i];
1578         p[i] = y;
1579      }
1580      if (p is aligned) {
1581         for (i = 3; i<N; i++){  # loop 3A
1582           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1583           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1584         }
1585      }
1586      else {
1587         for (i = 3; i<N; i++){  # loop 3B
1588           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1589           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1590         }
1591      }
1592
1593      These loops are later passed to loop_transform to be vectorized.  The
1594      vectorizer will use the alignment information to guide the transformation
1595      (whether to generate regular loads/stores, or with special handling for
1596      misalignment).  */
1597
1598 bool
1599 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1600 {
1601   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1602   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1603   enum dr_alignment_support supportable_dr_alignment;
1604   struct data_reference *dr0 = NULL, *first_store = NULL;
1605   struct data_reference *dr;
1606   unsigned int i, j;
1607   bool do_peeling = false;
1608   bool do_versioning = false;
1609   bool stat;
1610   gimple *stmt;
1611   stmt_vec_info stmt_info;
1612   unsigned int npeel = 0;
1613   bool one_misalignment_known = false;
1614   bool one_misalignment_unknown = false;
1615   bool one_dr_unsupportable = false;
1616   struct data_reference *unsupportable_dr = NULL;
1617   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1618   unsigned possible_npeel_number = 1;
1619   tree vectype;
1620   unsigned int mis, same_align_drs_max = 0;
1621   hash_table<peel_info_hasher> peeling_htab (1);
1622
1623   if (dump_enabled_p ())
1624     dump_printf_loc (MSG_NOTE, vect_location,
1625                      "=== vect_enhance_data_refs_alignment ===\n");
1626
1627   /* Reset data so we can safely be called multiple times.  */
1628   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1629   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1630
1631   /* While cost model enhancements are expected in the future, the high level
1632      view of the code at this time is as follows:
1633
1634      A) If there is a misaligned access then see if peeling to align
1635         this access can make all data references satisfy
1636         vect_supportable_dr_alignment.  If so, update data structures
1637         as needed and return true.
1638
1639      B) If peeling wasn't possible and there is a data reference with an
1640         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1641         then see if loop versioning checks can be used to make all data
1642         references satisfy vect_supportable_dr_alignment.  If so, update
1643         data structures as needed and return true.
1644
1645      C) If neither peeling nor versioning were successful then return false if
1646         any data reference does not satisfy vect_supportable_dr_alignment.
1647
1648      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1649
1650      Note, Possibility 3 above (which is peeling and versioning together) is not
1651      being done at this time.  */
1652
1653   /* (1) Peeling to force alignment.  */
1654
1655   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1656      Considerations:
1657      + How many accesses will become aligned due to the peeling
1658      - How many accesses will become unaligned due to the peeling,
1659        and the cost of misaligned accesses.
1660      - The cost of peeling (the extra runtime checks, the increase
1661        in code size).  */
1662
1663   FOR_EACH_VEC_ELT (datarefs, i, dr)
1664     {
1665       stmt = DR_STMT (dr);
1666       stmt_info = vinfo_for_stmt (stmt);
1667
1668       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1669         continue;
1670
1671       /* For interleaving, only the alignment of the first access
1672          matters.  */
1673       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1674           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1675         continue;
1676
1677       /* For invariant accesses there is nothing to enhance.  */
1678       if (integer_zerop (DR_STEP (dr)))
1679         continue;
1680
1681       /* Strided accesses perform only component accesses, alignment is
1682          irrelevant for them.  */
1683       if (STMT_VINFO_STRIDED_P (stmt_info)
1684           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1685         continue;
1686
1687       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1688       do_peeling = vector_alignment_reachable_p (dr);
1689       if (do_peeling)
1690         {
1691           if (known_alignment_for_access_p (dr))
1692             {
1693               unsigned int npeel_tmp = 0;
1694               bool negative = tree_int_cst_compare (DR_STEP (dr),
1695                                                     size_zero_node) < 0;
1696
1697               vectype = STMT_VINFO_VECTYPE (stmt_info);
1698               unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1699               unsigned int dr_size = vect_get_scalar_dr_size (dr);
1700               mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1701               if (DR_MISALIGNMENT (dr) != 0)
1702                 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1703
1704               /* For multiple types, it is possible that the bigger type access
1705                  will have more than one peeling option.  E.g., a loop with two
1706                  types: one of size (vector size / 4), and the other one of
1707                  size (vector size / 8).  Vectorization factor will 8.  If both
1708                  accesses are misaligned by 3, the first one needs one scalar
1709                  iteration to be aligned, and the second one needs 5.  But the
1710                  first one will be aligned also by peeling 5 scalar
1711                  iterations, and in that case both accesses will be aligned.
1712                  Hence, except for the immediate peeling amount, we also want
1713                  to try to add full vector size, while we don't exceed
1714                  vectorization factor.
1715                  We do this automatically for cost model, since we calculate
1716                  cost for every peeling option.  */
1717               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1718                 {
1719                   poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1720                                           ? vf * GROUP_SIZE (stmt_info) : vf);
1721                   possible_npeel_number
1722                     = vect_get_num_vectors (nscalars, vectype);
1723
1724                   /* NPEEL_TMP is 0 when there is no misalignment, but also
1725                      allow peeling NELEMENTS.  */
1726                   if (DR_MISALIGNMENT (dr) == 0)
1727                     possible_npeel_number++;
1728                 }
1729
1730               /* Save info about DR in the hash table.  Also include peeling
1731                  amounts according to the explanation above.  */
1732               for (j = 0; j < possible_npeel_number; j++)
1733                 {
1734                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1735                                             dr, npeel_tmp);
1736                   npeel_tmp += target_align / dr_size;
1737                 }
1738
1739               one_misalignment_known = true;
1740             }
1741           else
1742             {
1743               /* If we don't know any misalignment values, we prefer
1744                  peeling for data-ref that has the maximum number of data-refs
1745                  with the same alignment, unless the target prefers to align
1746                  stores over load.  */
1747               unsigned same_align_drs
1748                 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1749               if (!dr0
1750                   || same_align_drs_max < same_align_drs)
1751                 {
1752                   same_align_drs_max = same_align_drs;
1753                   dr0 = dr;
1754                 }
1755               /* For data-refs with the same number of related
1756                  accesses prefer the one where the misalign
1757                  computation will be invariant in the outermost loop.  */
1758               else if (same_align_drs_max == same_align_drs)
1759                 {
1760                   struct loop *ivloop0, *ivloop;
1761                   ivloop0 = outermost_invariant_loop_for_expr
1762                     (loop, DR_BASE_ADDRESS (dr0));
1763                   ivloop = outermost_invariant_loop_for_expr
1764                     (loop, DR_BASE_ADDRESS (dr));
1765                   if ((ivloop && !ivloop0)
1766                       || (ivloop && ivloop0
1767                           && flow_loop_nested_p (ivloop, ivloop0)))
1768                     dr0 = dr;
1769                 }
1770
1771               one_misalignment_unknown = true;
1772
1773               /* Check for data refs with unsupportable alignment that
1774                  can be peeled.  */
1775               if (!supportable_dr_alignment)
1776               {
1777                 one_dr_unsupportable = true;
1778                 unsupportable_dr = dr;
1779               }
1780
1781               if (!first_store && DR_IS_WRITE (dr))
1782                 first_store = dr;
1783             }
1784         }
1785       else
1786         {
1787           if (!aligned_access_p (dr))
1788             {
1789               if (dump_enabled_p ())
1790                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1791                                  "vector alignment may not be reachable\n");
1792               break;
1793             }
1794         }
1795     }
1796
1797   /* Check if we can possibly peel the loop.  */
1798   if (!vect_can_advance_ivs_p (loop_vinfo)
1799       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1800       || loop->inner)
1801     do_peeling = false;
1802
1803   struct _vect_peel_extended_info peel_for_known_alignment;
1804   struct _vect_peel_extended_info peel_for_unknown_alignment;
1805   struct _vect_peel_extended_info best_peel;
1806
1807   peel_for_unknown_alignment.inside_cost = INT_MAX;
1808   peel_for_unknown_alignment.outside_cost = INT_MAX;
1809   peel_for_unknown_alignment.peel_info.count = 0;
1810
1811   if (do_peeling
1812       && one_misalignment_unknown)
1813     {
1814       /* Check if the target requires to prefer stores over loads, i.e., if
1815          misaligned stores are more expensive than misaligned loads (taking
1816          drs with same alignment into account).  */
1817       unsigned int load_inside_cost = 0;
1818       unsigned int load_outside_cost = 0;
1819       unsigned int store_inside_cost = 0;
1820       unsigned int store_outside_cost = 0;
1821       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1822
1823       stmt_vector_for_cost dummy;
1824       dummy.create (2);
1825       vect_get_peeling_costs_all_drs (datarefs, dr0,
1826                                       &load_inside_cost,
1827                                       &load_outside_cost,
1828                                       &dummy, estimated_npeels, true);
1829       dummy.release ();
1830
1831       if (first_store)
1832         {
1833           dummy.create (2);
1834           vect_get_peeling_costs_all_drs (datarefs, first_store,
1835                                           &store_inside_cost,
1836                                           &store_outside_cost,
1837                                           &dummy, estimated_npeels, true);
1838           dummy.release ();
1839         }
1840       else
1841         {
1842           store_inside_cost = INT_MAX;
1843           store_outside_cost = INT_MAX;
1844         }
1845
1846       if (load_inside_cost > store_inside_cost
1847           || (load_inside_cost == store_inside_cost
1848               && load_outside_cost > store_outside_cost))
1849         {
1850           dr0 = first_store;
1851           peel_for_unknown_alignment.inside_cost = store_inside_cost;
1852           peel_for_unknown_alignment.outside_cost = store_outside_cost;
1853         }
1854       else
1855         {
1856           peel_for_unknown_alignment.inside_cost = load_inside_cost;
1857           peel_for_unknown_alignment.outside_cost = load_outside_cost;
1858         }
1859
1860       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1861       prologue_cost_vec.create (2);
1862       epilogue_cost_vec.create (2);
1863
1864       int dummy2;
1865       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1866         (loop_vinfo, estimated_npeels, &dummy2,
1867          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1868          &prologue_cost_vec, &epilogue_cost_vec);
1869
1870       prologue_cost_vec.release ();
1871       epilogue_cost_vec.release ();
1872
1873       peel_for_unknown_alignment.peel_info.count = 1
1874         + STMT_VINFO_SAME_ALIGN_REFS
1875         (vinfo_for_stmt (DR_STMT (dr0))).length ();
1876     }
1877
1878   peel_for_unknown_alignment.peel_info.npeel = 0;
1879   peel_for_unknown_alignment.peel_info.dr = dr0;
1880
1881   best_peel = peel_for_unknown_alignment;
1882
1883   peel_for_known_alignment.inside_cost = INT_MAX;
1884   peel_for_known_alignment.outside_cost = INT_MAX;
1885   peel_for_known_alignment.peel_info.count = 0;
1886   peel_for_known_alignment.peel_info.dr = NULL;
1887
1888   if (do_peeling && one_misalignment_known)
1889     {
1890       /* Peeling is possible, but there is no data access that is not supported
1891          unless aligned.  So we try to choose the best possible peeling from
1892          the hash table.  */
1893       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1894         (&peeling_htab, loop_vinfo);
1895     }
1896
1897   /* Compare costs of peeling for known and unknown alignment. */
1898   if (peel_for_known_alignment.peel_info.dr != NULL
1899       && peel_for_unknown_alignment.inside_cost
1900       >= peel_for_known_alignment.inside_cost)
1901     {
1902       best_peel = peel_for_known_alignment;
1903
1904       /* If the best peeling for known alignment has NPEEL == 0, perform no
1905          peeling at all except if there is an unsupportable dr that we can
1906          align.  */
1907       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1908         do_peeling = false;
1909     }
1910
1911   /* If there is an unsupportable data ref, prefer this over all choices so far
1912      since we'd have to discard a chosen peeling except when it accidentally
1913      aligned the unsupportable data ref.  */
1914   if (one_dr_unsupportable)
1915     dr0 = unsupportable_dr;
1916   else if (do_peeling)
1917     {
1918       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1919          TODO: Use nopeel_outside_cost or get rid of it?  */
1920       unsigned nopeel_inside_cost = 0;
1921       unsigned nopeel_outside_cost = 0;
1922
1923       stmt_vector_for_cost dummy;
1924       dummy.create (2);
1925       vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1926                                       &nopeel_outside_cost, &dummy, 0, false);
1927       dummy.release ();
1928
1929       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
1930          costs will be recorded.  */
1931       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1932       prologue_cost_vec.create (2);
1933       epilogue_cost_vec.create (2);
1934
1935       int dummy2;
1936       nopeel_outside_cost += vect_get_known_peeling_cost
1937         (loop_vinfo, 0, &dummy2,
1938          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1939          &prologue_cost_vec, &epilogue_cost_vec);
1940
1941       prologue_cost_vec.release ();
1942       epilogue_cost_vec.release ();
1943
1944       npeel = best_peel.peel_info.npeel;
1945       dr0 = best_peel.peel_info.dr;
1946
1947       /* If no peeling is not more expensive than the best peeling we
1948          have so far, don't perform any peeling.  */
1949       if (nopeel_inside_cost <= best_peel.inside_cost)
1950         do_peeling = false;
1951     }
1952
1953   if (do_peeling)
1954     {
1955       stmt = DR_STMT (dr0);
1956       stmt_info = vinfo_for_stmt (stmt);
1957       vectype = STMT_VINFO_VECTYPE (stmt_info);
1958
1959       if (known_alignment_for_access_p (dr0))
1960         {
1961           bool negative = tree_int_cst_compare (DR_STEP (dr0),
1962                                                 size_zero_node) < 0;
1963           if (!npeel)
1964             {
1965               /* Since it's known at compile time, compute the number of
1966                  iterations in the peeled loop (the peeling factor) for use in
1967                  updating DR_MISALIGNMENT values.  The peeling factor is the
1968                  vectorization factor minus the misalignment as an element
1969                  count.  */
1970               mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
1971               unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
1972               npeel = ((mis & (target_align - 1))
1973                        / vect_get_scalar_dr_size (dr0));
1974             }
1975
1976           /* For interleaved data access every iteration accesses all the
1977              members of the group, therefore we divide the number of iterations
1978              by the group size.  */
1979           stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1980           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1981             npeel /= GROUP_SIZE (stmt_info);
1982
1983           if (dump_enabled_p ())
1984             dump_printf_loc (MSG_NOTE, vect_location,
1985                              "Try peeling by %d\n", npeel);
1986         }
1987
1988       /* Ensure that all datarefs can be vectorized after the peel.  */
1989       if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1990         do_peeling = false;
1991
1992       /* Check if all datarefs are supportable and log.  */
1993       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1994         {
1995           stat = vect_verify_datarefs_alignment (loop_vinfo);
1996           if (!stat)
1997             do_peeling = false;
1998           else
1999             return stat;
2000         }
2001
2002       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2003       if (do_peeling)
2004         {
2005           unsigned max_allowed_peel
2006             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2007           if (max_allowed_peel != (unsigned)-1)
2008             {
2009               unsigned max_peel = npeel;
2010               if (max_peel == 0)
2011                 {
2012                   unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2013                   max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2014                 }
2015               if (max_peel > max_allowed_peel)
2016                 {
2017                   do_peeling = false;
2018                   if (dump_enabled_p ())
2019                     dump_printf_loc (MSG_NOTE, vect_location,
2020                         "Disable peeling, max peels reached: %d\n", max_peel);
2021                 }
2022             }
2023         }
2024
2025       /* Cost model #2 - if peeling may result in a remaining loop not
2026          iterating enough to be vectorized then do not peel.  Since this
2027          is a cost heuristic rather than a correctness decision, use the
2028          most likely runtime value for variable vectorization factors.  */
2029       if (do_peeling
2030           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2031         {
2032           unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2033           unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2034           if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2035               < assumed_vf + max_peel)
2036             do_peeling = false;
2037         }
2038
2039       if (do_peeling)
2040         {
2041           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2042              If the misalignment of DR_i is identical to that of dr0 then set
2043              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2044              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2045              by the peeling factor times the element size of DR_i (MOD the
2046              vectorization factor times the size).  Otherwise, the
2047              misalignment of DR_i must be set to unknown.  */
2048           FOR_EACH_VEC_ELT (datarefs, i, dr)
2049             if (dr != dr0)
2050               {
2051                 /* Strided accesses perform only component accesses, alignment
2052                    is irrelevant for them.  */
2053                 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2054                 if (STMT_VINFO_STRIDED_P (stmt_info)
2055                     && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2056                   continue;
2057
2058                 vect_update_misalignment_for_peel (dr, dr0, npeel);
2059               }
2060
2061           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2062           if (npeel)
2063             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2064           else
2065             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2066               = DR_MISALIGNMENT (dr0);
2067           SET_DR_MISALIGNMENT (dr0, 0);
2068           if (dump_enabled_p ())
2069             {
2070               dump_printf_loc (MSG_NOTE, vect_location,
2071                                "Alignment of access forced using peeling.\n");
2072               dump_printf_loc (MSG_NOTE, vect_location,
2073                                "Peeling for alignment will be applied.\n");
2074             }
2075
2076           /* The inside-loop cost will be accounted for in vectorizable_load
2077              and vectorizable_store correctly with adjusted alignments.
2078              Drop the body_cst_vec on the floor here.  */
2079           stat = vect_verify_datarefs_alignment (loop_vinfo);
2080           gcc_assert (stat);
2081           return stat;
2082         }
2083     }
2084
2085   /* (2) Versioning to force alignment.  */
2086
2087   /* Try versioning if:
2088      1) optimize loop for speed
2089      2) there is at least one unsupported misaligned data ref with an unknown
2090         misalignment, and
2091      3) all misaligned data refs with a known misalignment are supported, and
2092      4) the number of runtime alignment checks is within reason.  */
2093
2094   do_versioning =
2095         optimize_loop_nest_for_speed_p (loop)
2096         && (!loop->inner); /* FORNOW */
2097
2098   if (do_versioning)
2099     {
2100       FOR_EACH_VEC_ELT (datarefs, i, dr)
2101         {
2102           stmt = DR_STMT (dr);
2103           stmt_info = vinfo_for_stmt (stmt);
2104
2105           /* For interleaving, only the alignment of the first access
2106              matters.  */
2107           if (aligned_access_p (dr)
2108               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2109                   && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2110             continue;
2111
2112           if (STMT_VINFO_STRIDED_P (stmt_info))
2113             {
2114               /* Strided loads perform only component accesses, alignment is
2115                  irrelevant for them.  */
2116               if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2117                 continue;
2118               do_versioning = false;
2119               break;
2120             }
2121
2122           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2123
2124           if (!supportable_dr_alignment)
2125             {
2126               gimple *stmt;
2127               int mask;
2128               tree vectype;
2129
2130               if (known_alignment_for_access_p (dr)
2131                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2132                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2133                 {
2134                   do_versioning = false;
2135                   break;
2136                 }
2137
2138               stmt = DR_STMT (dr);
2139               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2140               gcc_assert (vectype);
2141
2142               /* The rightmost bits of an aligned address must be zeros.
2143                  Construct the mask needed for this test.  For example,
2144                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2145                  mask must be 15 = 0xf. */
2146               mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
2147
2148               /* FORNOW: use the same mask to test all potentially unaligned
2149                  references in the loop.  The vectorizer currently supports
2150                  a single vector size, see the reference to
2151                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2152                  vectorization factor is computed.  */
2153               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2154                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2155               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2156               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2157                       DR_STMT (dr));
2158             }
2159         }
2160
2161       /* Versioning requires at least one misaligned data reference.  */
2162       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2163         do_versioning = false;
2164       else if (!do_versioning)
2165         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2166     }
2167
2168   if (do_versioning)
2169     {
2170       vec<gimple *> may_misalign_stmts
2171         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2172       gimple *stmt;
2173
2174       /* It can now be assumed that the data references in the statements
2175          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2176          of the loop being vectorized.  */
2177       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2178         {
2179           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2180           dr = STMT_VINFO_DATA_REF (stmt_info);
2181           SET_DR_MISALIGNMENT (dr, 0);
2182           if (dump_enabled_p ())
2183             dump_printf_loc (MSG_NOTE, vect_location,
2184                              "Alignment of access forced using versioning.\n");
2185         }
2186
2187       if (dump_enabled_p ())
2188         dump_printf_loc (MSG_NOTE, vect_location,
2189                          "Versioning for alignment will be applied.\n");
2190
2191       /* Peeling and versioning can't be done together at this time.  */
2192       gcc_assert (! (do_peeling && do_versioning));
2193
2194       stat = vect_verify_datarefs_alignment (loop_vinfo);
2195       gcc_assert (stat);
2196       return stat;
2197     }
2198
2199   /* This point is reached if neither peeling nor versioning is being done.  */
2200   gcc_assert (! (do_peeling || do_versioning));
2201
2202   stat = vect_verify_datarefs_alignment (loop_vinfo);
2203   return stat;
2204 }
2205
2206
2207 /* Function vect_find_same_alignment_drs.
2208
2209    Update group and alignment relations according to the chosen
2210    vectorization factor.  */
2211
2212 static void
2213 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2214 {
2215   struct data_reference *dra = DDR_A (ddr);
2216   struct data_reference *drb = DDR_B (ddr);
2217   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2218   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2219
2220   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2221     return;
2222
2223   if (dra == drb)
2224     return;
2225
2226   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2227       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2228       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2229     return;
2230
2231   /* Two references with distance zero have the same alignment.  */
2232   offset_int diff = (wi::to_offset (DR_INIT (dra))
2233                      - wi::to_offset (DR_INIT (drb)));
2234   if (diff != 0)
2235     {
2236       /* Get the wider of the two alignments.  */
2237       unsigned int align_a = (vect_calculate_target_alignment (dra)
2238                               / BITS_PER_UNIT);
2239       unsigned int align_b = (vect_calculate_target_alignment (drb)
2240                               / BITS_PER_UNIT);
2241       unsigned int max_align = MAX (align_a, align_b);
2242
2243       /* Require the gap to be a multiple of the larger vector alignment.  */
2244       if (!wi::multiple_of_p (diff, max_align, SIGNED))
2245         return;
2246     }
2247
2248   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2249   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2250   if (dump_enabled_p ())
2251     {
2252       dump_printf_loc (MSG_NOTE, vect_location,
2253                        "accesses have the same alignment: ");
2254       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2255       dump_printf (MSG_NOTE,  " and ");
2256       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2257       dump_printf (MSG_NOTE, "\n");
2258     }
2259 }
2260
2261
2262 /* Function vect_analyze_data_refs_alignment
2263
2264    Analyze the alignment of the data-references in the loop.
2265    Return FALSE if a data reference is found that cannot be vectorized.  */
2266
2267 bool
2268 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2269 {
2270   if (dump_enabled_p ())
2271     dump_printf_loc (MSG_NOTE, vect_location,
2272                      "=== vect_analyze_data_refs_alignment ===\n");
2273
2274   /* Mark groups of data references with same alignment using
2275      data dependence information.  */
2276   vec<ddr_p> ddrs = vinfo->ddrs;
2277   struct data_dependence_relation *ddr;
2278   unsigned int i;
2279
2280   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2281     vect_find_same_alignment_drs (ddr);
2282
2283   vec<data_reference_p> datarefs = vinfo->datarefs;
2284   struct data_reference *dr;
2285
2286   vect_record_base_alignments (vinfo);
2287   FOR_EACH_VEC_ELT (datarefs, i, dr)
2288     {
2289       stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2290       if (STMT_VINFO_VECTORIZABLE (stmt_info)
2291           && !vect_compute_data_ref_alignment (dr))
2292         {
2293           /* Strided accesses perform only component accesses, misalignment
2294              information is irrelevant for them.  */
2295           if (STMT_VINFO_STRIDED_P (stmt_info)
2296               && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2297             continue;
2298
2299           if (dump_enabled_p ())
2300             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2301                              "not vectorized: can't calculate alignment "
2302                              "for data ref.\n");
2303
2304           return false;
2305         }
2306     }
2307
2308   return true;
2309 }
2310
2311
2312 /* Analyze alignment of DRs of stmts in NODE.  */
2313
2314 static bool
2315 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2316 {
2317   /* We vectorize from the first scalar stmt in the node unless
2318      the node is permuted in which case we start from the first
2319      element in the group.  */
2320   gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2321   data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2322   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2323     first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2324
2325   data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2326   if (! vect_compute_data_ref_alignment (dr)
2327       /* For creating the data-ref pointer we need alignment of the
2328          first element anyway.  */
2329       || (dr != first_dr
2330           && ! vect_compute_data_ref_alignment (first_dr))
2331       || ! verify_data_ref_alignment (dr))
2332     {
2333       if (dump_enabled_p ())
2334         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2335                          "not vectorized: bad data alignment in basic "
2336                          "block.\n");
2337       return false;
2338     }
2339
2340   return true;
2341 }
2342
2343 /* Function vect_slp_analyze_instance_alignment
2344
2345    Analyze the alignment of the data-references in the SLP instance.
2346    Return FALSE if a data reference is found that cannot be vectorized.  */
2347
2348 bool
2349 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2350 {
2351   if (dump_enabled_p ())
2352     dump_printf_loc (MSG_NOTE, vect_location,
2353                      "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2354
2355   slp_tree node;
2356   unsigned i;
2357   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2358     if (! vect_slp_analyze_and_verify_node_alignment (node))
2359       return false;
2360
2361   node = SLP_INSTANCE_TREE (instance);
2362   if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2363       && ! vect_slp_analyze_and_verify_node_alignment
2364              (SLP_INSTANCE_TREE (instance)))
2365     return false;
2366
2367   return true;
2368 }
2369
2370
2371 /* Analyze groups of accesses: check that DR belongs to a group of
2372    accesses of legal size, step, etc.  Detect gaps, single element
2373    interleaving, and other special cases. Set grouped access info.
2374    Collect groups of strided stores for further use in SLP analysis.
2375    Worker for vect_analyze_group_access.  */
2376
2377 static bool
2378 vect_analyze_group_access_1 (struct data_reference *dr)
2379 {
2380   tree step = DR_STEP (dr);
2381   tree scalar_type = TREE_TYPE (DR_REF (dr));
2382   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2383   gimple *stmt = DR_STMT (dr);
2384   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2385   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2386   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2387   HOST_WIDE_INT dr_step = -1;
2388   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2389   bool slp_impossible = false;
2390
2391   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2392      size of the interleaving group (including gaps).  */
2393   if (tree_fits_shwi_p (step))
2394     {
2395       dr_step = tree_to_shwi (step);
2396       /* Check that STEP is a multiple of type size.  Otherwise there is
2397          a non-element-sized gap at the end of the group which we
2398          cannot represent in GROUP_GAP or GROUP_SIZE.
2399          ???  As we can handle non-constant step fine here we should
2400          simply remove uses of GROUP_GAP between the last and first
2401          element and instead rely on DR_STEP.  GROUP_SIZE then would
2402          simply not include that gap.  */
2403       if ((dr_step % type_size) != 0)
2404         {
2405           if (dump_enabled_p ())
2406             {
2407               dump_printf_loc (MSG_NOTE, vect_location,
2408                                "Step ");
2409               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2410               dump_printf (MSG_NOTE,
2411                            " is not a multiple of the element size for ");
2412               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2413               dump_printf (MSG_NOTE, "\n");
2414             }
2415           return false;
2416         }
2417       groupsize = absu_hwi (dr_step) / type_size;
2418     }
2419   else
2420     groupsize = 0;
2421
2422   /* Not consecutive access is possible only if it is a part of interleaving.  */
2423   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2424     {
2425       /* Check if it this DR is a part of interleaving, and is a single
2426          element of the group that is accessed in the loop.  */
2427
2428       /* Gaps are supported only for loads. STEP must be a multiple of the type
2429          size.  The size of the group must be a power of 2.  */
2430       if (DR_IS_READ (dr)
2431           && (dr_step % type_size) == 0
2432           && groupsize > 0
2433           && pow2p_hwi (groupsize))
2434         {
2435           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2436           GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2437           GROUP_GAP (stmt_info) = groupsize - 1;
2438           if (dump_enabled_p ())
2439             {
2440               dump_printf_loc (MSG_NOTE, vect_location,
2441                                "Detected single element interleaving ");
2442               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2443               dump_printf (MSG_NOTE, " step ");
2444               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2445               dump_printf (MSG_NOTE, "\n");
2446             }
2447
2448           return true;
2449         }
2450
2451       if (dump_enabled_p ())
2452         {
2453           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2454                            "not consecutive access ");
2455           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2456         }
2457
2458       if (bb_vinfo)
2459         {
2460           /* Mark the statement as unvectorizable.  */
2461           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2462           return true;
2463         }
2464
2465       dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2466       STMT_VINFO_STRIDED_P (stmt_info) = true;
2467       return true;
2468     }
2469
2470   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2471     {
2472       /* First stmt in the interleaving chain. Check the chain.  */
2473       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2474       struct data_reference *data_ref = dr;
2475       unsigned int count = 1;
2476       tree prev_init = DR_INIT (data_ref);
2477       gimple *prev = stmt;
2478       HOST_WIDE_INT diff, gaps = 0;
2479
2480       while (next)
2481         {
2482           /* Skip same data-refs.  In case that two or more stmts share
2483              data-ref (supported only for loads), we vectorize only the first
2484              stmt, and the rest get their vectorized loads from the first
2485              one.  */
2486           if (!tree_int_cst_compare (DR_INIT (data_ref),
2487                                      DR_INIT (STMT_VINFO_DATA_REF (
2488                                                    vinfo_for_stmt (next)))))
2489             {
2490               if (DR_IS_WRITE (data_ref))
2491                 {
2492                   if (dump_enabled_p ())
2493                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2494                                      "Two store stmts share the same dr.\n");
2495                   return false;
2496                 }
2497
2498               if (dump_enabled_p ())
2499                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2500                                  "Two or more load stmts share the same dr.\n");
2501
2502               /* For load use the same data-ref load.  */
2503               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2504
2505               prev = next;
2506               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2507               continue;
2508             }
2509
2510           prev = next;
2511           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2512
2513           /* All group members have the same STEP by construction.  */
2514           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2515
2516           /* Check that the distance between two accesses is equal to the type
2517              size. Otherwise, we have gaps.  */
2518           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2519                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2520           if (diff != 1)
2521             {
2522               /* FORNOW: SLP of accesses with gaps is not supported.  */
2523               slp_impossible = true;
2524               if (DR_IS_WRITE (data_ref))
2525                 {
2526                   if (dump_enabled_p ())
2527                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2528                                      "interleaved store with gaps\n");
2529                   return false;
2530                 }
2531
2532               gaps += diff - 1;
2533             }
2534
2535           last_accessed_element += diff;
2536
2537           /* Store the gap from the previous member of the group. If there is no
2538              gap in the access, GROUP_GAP is always 1.  */
2539           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2540
2541           prev_init = DR_INIT (data_ref);
2542           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2543           /* Count the number of data-refs in the chain.  */
2544           count++;
2545         }
2546
2547       if (groupsize == 0)
2548         groupsize = count + gaps;
2549
2550       /* This could be UINT_MAX but as we are generating code in a very
2551          inefficient way we have to cap earlier.  See PR78699 for example.  */
2552       if (groupsize > 4096)
2553         {
2554           if (dump_enabled_p ())
2555             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2556                              "group is too large\n");
2557           return false;
2558         }
2559
2560       /* Check that the size of the interleaving is equal to count for stores,
2561          i.e., that there are no gaps.  */
2562       if (groupsize != count
2563           && !DR_IS_READ (dr))
2564         {
2565           if (dump_enabled_p ())
2566             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2567                              "interleaved store with gaps\n");
2568           return false;
2569         }
2570
2571       /* If there is a gap after the last load in the group it is the
2572          difference between the groupsize and the last accessed
2573          element.
2574          When there is no gap, this difference should be 0.  */
2575       GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2576
2577       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2578       if (dump_enabled_p ())
2579         {
2580           dump_printf_loc (MSG_NOTE, vect_location,
2581                            "Detected interleaving ");
2582           if (DR_IS_READ (dr))
2583             dump_printf (MSG_NOTE, "load ");
2584           else
2585             dump_printf (MSG_NOTE, "store ");
2586           dump_printf (MSG_NOTE, "of size %u starting with ",
2587                        (unsigned)groupsize);
2588           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2589           if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2590             dump_printf_loc (MSG_NOTE, vect_location,
2591                              "There is a gap of %u elements after the group\n",
2592                              GROUP_GAP (vinfo_for_stmt (stmt)));
2593         }
2594
2595       /* SLP: create an SLP data structure for every interleaving group of
2596          stores for further analysis in vect_analyse_slp.  */
2597       if (DR_IS_WRITE (dr) && !slp_impossible)
2598         {
2599           if (loop_vinfo)
2600             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2601           if (bb_vinfo)
2602             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2603         }
2604     }
2605
2606   return true;
2607 }
2608
2609 /* Analyze groups of accesses: check that DR belongs to a group of
2610    accesses of legal size, step, etc.  Detect gaps, single element
2611    interleaving, and other special cases. Set grouped access info.
2612    Collect groups of strided stores for further use in SLP analysis.  */
2613
2614 static bool
2615 vect_analyze_group_access (struct data_reference *dr)
2616 {
2617   if (!vect_analyze_group_access_1 (dr))
2618     {
2619       /* Dissolve the group if present.  */
2620       gimple *next;
2621       gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2622       while (stmt)
2623         {
2624           stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2625           next = GROUP_NEXT_ELEMENT (vinfo);
2626           GROUP_FIRST_ELEMENT (vinfo) = NULL;
2627           GROUP_NEXT_ELEMENT (vinfo) = NULL;
2628           stmt = next;
2629         }
2630       return false;
2631     }
2632   return true;
2633 }
2634
2635 /* Analyze the access pattern of the data-reference DR.
2636    In case of non-consecutive accesses call vect_analyze_group_access() to
2637    analyze groups of accesses.  */
2638
2639 static bool
2640 vect_analyze_data_ref_access (struct data_reference *dr)
2641 {
2642   tree step = DR_STEP (dr);
2643   tree scalar_type = TREE_TYPE (DR_REF (dr));
2644   gimple *stmt = DR_STMT (dr);
2645   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2646   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2647   struct loop *loop = NULL;
2648
2649   if (loop_vinfo)
2650     loop = LOOP_VINFO_LOOP (loop_vinfo);
2651
2652   if (loop_vinfo && !step)
2653     {
2654       if (dump_enabled_p ())
2655         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2656                          "bad data-ref access in loop\n");
2657       return false;
2658     }
2659
2660   /* Allow loads with zero step in inner-loop vectorization.  */
2661   if (loop_vinfo && integer_zerop (step))
2662     {
2663       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2664       if (!nested_in_vect_loop_p (loop, stmt))
2665         return DR_IS_READ (dr);
2666       /* Allow references with zero step for outer loops marked
2667          with pragma omp simd only - it guarantees absence of
2668          loop-carried dependencies between inner loop iterations.  */
2669       if (!loop->force_vectorize)
2670         {
2671           if (dump_enabled_p ())
2672             dump_printf_loc (MSG_NOTE, vect_location,
2673                              "zero step in inner loop of nest\n");
2674           return false;
2675         }
2676     }
2677
2678   if (loop && nested_in_vect_loop_p (loop, stmt))
2679     {
2680       /* Interleaved accesses are not yet supported within outer-loop
2681         vectorization for references in the inner-loop.  */
2682       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2683
2684       /* For the rest of the analysis we use the outer-loop step.  */
2685       step = STMT_VINFO_DR_STEP (stmt_info);
2686       if (integer_zerop (step))
2687         {
2688           if (dump_enabled_p ())
2689             dump_printf_loc (MSG_NOTE, vect_location,
2690                              "zero step in outer loop.\n");
2691           return DR_IS_READ (dr);
2692         }
2693     }
2694
2695   /* Consecutive?  */
2696   if (TREE_CODE (step) == INTEGER_CST)
2697     {
2698       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2699       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2700           || (dr_step < 0
2701               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2702         {
2703           /* Mark that it is not interleaving.  */
2704           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2705           return true;
2706         }
2707     }
2708
2709   if (loop && nested_in_vect_loop_p (loop, stmt))
2710     {
2711       if (dump_enabled_p ())
2712         dump_printf_loc (MSG_NOTE, vect_location,
2713                          "grouped access in outer loop.\n");
2714       return false;
2715     }
2716
2717
2718   /* Assume this is a DR handled by non-constant strided load case.  */
2719   if (TREE_CODE (step) != INTEGER_CST)
2720     return (STMT_VINFO_STRIDED_P (stmt_info)
2721             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2722                 || vect_analyze_group_access (dr)));
2723
2724   /* Not consecutive access - check if it's a part of interleaving group.  */
2725   return vect_analyze_group_access (dr);
2726 }
2727
2728 /* Compare two data-references DRA and DRB to group them into chunks
2729    suitable for grouping.  */
2730
2731 static int
2732 dr_group_sort_cmp (const void *dra_, const void *drb_)
2733 {
2734   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2735   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2736   int cmp;
2737
2738   /* Stabilize sort.  */
2739   if (dra == drb)
2740     return 0;
2741
2742   /* DRs in different loops never belong to the same group.  */
2743   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2744   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2745   if (loopa != loopb)
2746     return loopa->num < loopb->num ? -1 : 1;
2747
2748   /* Ordering of DRs according to base.  */
2749   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2750                                DR_BASE_ADDRESS (drb));
2751   if (cmp != 0)
2752     return cmp;
2753
2754   /* And according to DR_OFFSET.  */
2755   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2756   if (cmp != 0)
2757     return cmp;
2758
2759   /* Put reads before writes.  */
2760   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2761     return DR_IS_READ (dra) ? -1 : 1;
2762
2763   /* Then sort after access size.  */
2764   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2765                                TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2766   if (cmp != 0)
2767     return cmp;
2768
2769   /* And after step.  */
2770   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2771   if (cmp != 0)
2772     return cmp;
2773
2774   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2775   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2776   if (cmp == 0)
2777     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2778   return cmp;
2779 }
2780
2781 /* Function vect_analyze_data_ref_accesses.
2782
2783    Analyze the access pattern of all the data references in the loop.
2784
2785    FORNOW: the only access pattern that is considered vectorizable is a
2786            simple step 1 (consecutive) access.
2787
2788    FORNOW: handle only arrays and pointer accesses.  */
2789
2790 bool
2791 vect_analyze_data_ref_accesses (vec_info *vinfo)
2792 {
2793   unsigned int i;
2794   vec<data_reference_p> datarefs = vinfo->datarefs;
2795   struct data_reference *dr;
2796
2797   if (dump_enabled_p ())
2798     dump_printf_loc (MSG_NOTE, vect_location,
2799                      "=== vect_analyze_data_ref_accesses ===\n");
2800
2801   if (datarefs.is_empty ())
2802     return true;
2803
2804   /* Sort the array of datarefs to make building the interleaving chains
2805      linear.  Don't modify the original vector's order, it is needed for
2806      determining what dependencies are reversed.  */
2807   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2808   datarefs_copy.qsort (dr_group_sort_cmp);
2809
2810   /* Build the interleaving chains.  */
2811   for (i = 0; i < datarefs_copy.length () - 1;)
2812     {
2813       data_reference_p dra = datarefs_copy[i];
2814       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2815       stmt_vec_info lastinfo = NULL;
2816       if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2817         {
2818           ++i;
2819           continue;
2820         }
2821       for (i = i + 1; i < datarefs_copy.length (); ++i)
2822         {
2823           data_reference_p drb = datarefs_copy[i];
2824           stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2825           if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2826             break;
2827
2828           /* ???  Imperfect sorting (non-compatible types, non-modulo
2829              accesses, same accesses) can lead to a group to be artificially
2830              split here as we don't just skip over those.  If it really
2831              matters we can push those to a worklist and re-iterate
2832              over them.  The we can just skip ahead to the next DR here.  */
2833
2834           /* DRs in a different loop should not be put into the same
2835              interleaving group.  */
2836           if (gimple_bb (DR_STMT (dra))->loop_father
2837               != gimple_bb (DR_STMT (drb))->loop_father)
2838             break;
2839
2840           /* Check that the data-refs have same first location (except init)
2841              and they are both either store or load (not load and store,
2842              not masked loads or stores).  */
2843           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2844               || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2845                                         DR_BASE_ADDRESS (drb)) != 0
2846               || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2847               || !gimple_assign_single_p (DR_STMT (dra))
2848               || !gimple_assign_single_p (DR_STMT (drb)))
2849             break;
2850
2851           /* Check that the data-refs have the same constant size.  */
2852           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2853           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2854           if (!tree_fits_uhwi_p (sza)
2855               || !tree_fits_uhwi_p (szb)
2856               || !tree_int_cst_equal (sza, szb))
2857             break;
2858
2859           /* Check that the data-refs have the same step.  */
2860           if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2861             break;
2862
2863           /* Check the types are compatible.
2864              ???  We don't distinguish this during sorting.  */
2865           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2866                                    TREE_TYPE (DR_REF (drb))))
2867             break;
2868
2869           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2870           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2871           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2872           HOST_WIDE_INT init_prev
2873             = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2874           gcc_assert (init_a <= init_b
2875                       && init_a <= init_prev
2876                       && init_prev <= init_b);
2877
2878           /* Do not place the same access in the interleaving chain twice.  */
2879           if (init_b == init_prev)
2880             {
2881               gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2882                           < gimple_uid (DR_STMT (drb)));
2883               /* ???  For now we simply "drop" the later reference which is
2884                  otherwise the same rather than finishing off this group.
2885                  In the end we'd want to re-process duplicates forming
2886                  multiple groups from the refs, likely by just collecting
2887                  all candidates (including duplicates and split points
2888                  below) in a vector and then process them together.  */
2889               continue;
2890             }
2891
2892           /* If init_b == init_a + the size of the type * k, we have an
2893              interleaving, and DRA is accessed before DRB.  */
2894           HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2895           if (type_size_a == 0
2896               || (init_b - init_a) % type_size_a != 0)
2897             break;
2898
2899           /* If we have a store, the accesses are adjacent.  This splits
2900              groups into chunks we support (we don't support vectorization
2901              of stores with gaps).  */
2902           if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
2903             break;
2904
2905           /* If the step (if not zero or non-constant) is greater than the
2906              difference between data-refs' inits this splits groups into
2907              suitable sizes.  */
2908           if (tree_fits_shwi_p (DR_STEP (dra)))
2909             {
2910               HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2911               if (step != 0 && step <= (init_b - init_a))
2912                 break;
2913             }
2914
2915           if (dump_enabled_p ())
2916             {
2917               dump_printf_loc (MSG_NOTE, vect_location,
2918                                "Detected interleaving ");
2919               if (DR_IS_READ (dra))
2920                 dump_printf (MSG_NOTE, "load ");
2921               else
2922                 dump_printf (MSG_NOTE, "store ");
2923               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2924               dump_printf (MSG_NOTE,  " and ");
2925               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2926               dump_printf (MSG_NOTE, "\n");
2927             }
2928
2929           /* Link the found element into the group list.  */
2930           if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2931             {
2932               GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2933               lastinfo = stmtinfo_a;
2934             }
2935           GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2936           GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2937           lastinfo = stmtinfo_b;
2938         }
2939     }
2940
2941   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2942     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2943         && !vect_analyze_data_ref_access (dr))
2944       {
2945         if (dump_enabled_p ())
2946           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2947                            "not vectorized: complicated access pattern.\n");
2948
2949         if (is_a <bb_vec_info> (vinfo))
2950           {
2951             /* Mark the statement as not vectorizable.  */
2952             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2953             continue;
2954           }
2955         else
2956           {
2957             datarefs_copy.release ();
2958             return false;
2959           }
2960       }
2961
2962   datarefs_copy.release ();
2963   return true;
2964 }
2965
2966 /* Function vect_vfa_segment_size.
2967
2968    Create an expression that computes the size of segment
2969    that will be accessed for a data reference.  The functions takes into
2970    account that realignment loads may access one more vector.
2971
2972    Input:
2973      DR: The data reference.
2974      LENGTH_FACTOR: segment length to consider.
2975
2976    Return an expression whose value is the size of segment which will be
2977    accessed by DR.  */
2978
2979 static tree
2980 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2981 {
2982   tree segment_length;
2983
2984   if (integer_zerop (DR_STEP (dr)))
2985     segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2986   else
2987     segment_length = size_binop (MULT_EXPR,
2988                                  fold_convert (sizetype, DR_STEP (dr)),
2989                                  fold_convert (sizetype, length_factor));
2990
2991   if (vect_supportable_dr_alignment (dr, false)
2992         == dr_explicit_realign_optimized)
2993     {
2994       tree vector_size = TYPE_SIZE_UNIT
2995                           (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2996
2997       segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2998     }
2999   return segment_length;
3000 }
3001
3002 /* Function vect_no_alias_p.
3003
3004    Given data references A and B with equal base and offset, see whether
3005    the alias relation can be decided at compilation time.  Return 1 if
3006    it can and the references alias, 0 if it can and the references do
3007    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A
3008    and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
3009    respectively.  */
3010
3011 static int
3012 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3013                          tree segment_length_a, tree segment_length_b)
3014 {
3015   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3016   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3017   poly_uint64 const_length_a;
3018   poly_uint64 const_length_b;
3019
3020   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3021      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3022      [a, a+12) */
3023   if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3024     {
3025       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3026       offset_a = (offset_a + vect_get_scalar_dr_size (a)) - const_length_a;
3027     }
3028   else
3029     const_length_a = tree_to_poly_uint64 (segment_length_a);
3030   if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3031     {
3032       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3033       offset_b = (offset_b + vect_get_scalar_dr_size (b)) - const_length_b;
3034     }
3035   else
3036     const_length_b = tree_to_poly_uint64 (segment_length_b);
3037
3038   if (ranges_known_overlap_p (offset_a, const_length_a,
3039                               offset_b, const_length_b))
3040     return 1;
3041
3042   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3043                                offset_b, const_length_b))
3044     return 0;
3045
3046   return -1;
3047 }
3048
3049 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3050    in DDR is >= VF.  */
3051
3052 static bool
3053 dependence_distance_ge_vf (data_dependence_relation *ddr,
3054                            unsigned int loop_depth, poly_uint64 vf)
3055 {
3056   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3057       || DDR_NUM_DIST_VECTS (ddr) == 0)
3058     return false;
3059
3060   /* If the dependence is exact, we should have limited the VF instead.  */
3061   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3062
3063   unsigned int i;
3064   lambda_vector dist_v;
3065   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3066     {
3067       HOST_WIDE_INT dist = dist_v[loop_depth];
3068       if (dist != 0
3069           && !(dist > 0 && DDR_REVERSED_P (ddr))
3070           && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3071         return false;
3072     }
3073
3074   if (dump_enabled_p ())
3075     {
3076       dump_printf_loc (MSG_NOTE, vect_location,
3077                        "dependence distance between ");
3078       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3079       dump_printf (MSG_NOTE,  " and ");
3080       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3081       dump_printf (MSG_NOTE,  " is >= VF\n");
3082     }
3083
3084   return true;
3085 }
3086
3087 /* Function vect_prune_runtime_alias_test_list.
3088
3089    Prune a list of ddrs to be tested at run-time by versioning for alias.
3090    Merge several alias checks into one if possible.
3091    Return FALSE if resulting list of ddrs is longer then allowed by
3092    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3093
3094 bool
3095 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3096 {
3097   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3098   hash_set <tree_pair_hash> compared_objects;
3099
3100   vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3101   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3102     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3103   vec<vec_object_pair> &check_unequal_addrs
3104     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3105   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3106   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3107
3108   ddr_p ddr;
3109   unsigned int i;
3110   tree length_factor;
3111
3112   if (dump_enabled_p ())
3113     dump_printf_loc (MSG_NOTE, vect_location,
3114                      "=== vect_prune_runtime_alias_test_list ===\n");
3115
3116   if (may_alias_ddrs.is_empty ())
3117     return true;
3118
3119   comp_alias_ddrs.create (may_alias_ddrs.length ());
3120
3121   unsigned int loop_depth
3122     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3123                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
3124
3125   /* First, we collect all data ref pairs for aliasing checks.  */
3126   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3127     {
3128       int comp_res;
3129       struct data_reference *dr_a, *dr_b;
3130       gimple *dr_group_first_a, *dr_group_first_b;
3131       tree segment_length_a, segment_length_b;
3132       gimple *stmt_a, *stmt_b;
3133
3134       /* Ignore the alias if the VF we chose ended up being no greater
3135          than the dependence distance.  */
3136       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3137         continue;
3138
3139       if (DDR_OBJECT_A (ddr))
3140         {
3141           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3142           if (!compared_objects.add (new_pair))
3143             {
3144               if (dump_enabled_p ())
3145                 {
3146                   dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3147                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3148                   dump_printf (MSG_NOTE, " and ");
3149                   dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3150                   dump_printf (MSG_NOTE, " have different addresses\n");
3151                 }
3152               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3153             }
3154           continue;
3155         }
3156
3157       dr_a = DDR_A (ddr);
3158       stmt_a = DR_STMT (DDR_A (ddr));
3159       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3160       if (dr_group_first_a)
3161         {
3162           stmt_a = dr_group_first_a;
3163           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3164         }
3165
3166       dr_b = DDR_B (ddr);
3167       stmt_b = DR_STMT (DDR_B (ddr));
3168       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3169       if (dr_group_first_b)
3170         {
3171           stmt_b = dr_group_first_b;
3172           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3173         }
3174
3175       if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3176         length_factor = scalar_loop_iters;
3177       else
3178         length_factor = size_int (vect_factor);
3179       segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3180       segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3181
3182       comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3183                                         DR_BASE_ADDRESS (dr_b));
3184       if (comp_res == 0)
3185         comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3186                                           DR_OFFSET (dr_b));
3187
3188       /* See whether the alias is known at compilation time.  */
3189       if (comp_res == 0
3190           && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3191           && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3192           && poly_int_tree_p (segment_length_a)
3193           && poly_int_tree_p (segment_length_b))
3194         {
3195           int res = vect_compile_time_alias (dr_a, dr_b,
3196                                              segment_length_a,
3197                                              segment_length_b);
3198           if (res == 0)
3199             continue;
3200
3201           if (res == 1)
3202             {
3203               if (dump_enabled_p ())
3204                 dump_printf_loc (MSG_NOTE, vect_location,
3205                                  "not vectorized: compilation time alias.\n");
3206               return false;
3207             }
3208         }
3209
3210       dr_with_seg_len_pair_t dr_with_seg_len_pair
3211           (dr_with_seg_len (dr_a, segment_length_a),
3212            dr_with_seg_len (dr_b, segment_length_b));
3213
3214       /* Canonicalize pairs by sorting the two DR members.  */
3215       if (comp_res > 0)
3216         std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3217
3218       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3219     }
3220
3221   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3222
3223   unsigned int count = (comp_alias_ddrs.length ()
3224                         + check_unequal_addrs.length ());
3225   dump_printf_loc (MSG_NOTE, vect_location,
3226                    "improved number of alias checks from %d to %d\n",
3227                    may_alias_ddrs.length (), count);
3228   if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3229     {
3230       if (dump_enabled_p ())
3231         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3232                          "number of versioning for alias "
3233                          "run-time tests exceeds %d "
3234                          "(--param vect-max-version-for-alias-checks)\n",
3235                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3236       return false;
3237     }
3238
3239   return true;
3240 }
3241
3242 /* Return true if a non-affine read or write in STMT is suitable for a
3243    gather load or scatter store.  Describe the operation in *INFO if so.  */
3244
3245 bool
3246 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3247                            gather_scatter_info *info)
3248 {
3249   HOST_WIDE_INT scale = 1;
3250   poly_int64 pbitpos, pbitsize;
3251   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3252   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3253   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3254   tree offtype = NULL_TREE;
3255   tree decl, base, off;
3256   machine_mode pmode;
3257   int punsignedp, reversep, pvolatilep = 0;
3258
3259   base = DR_REF (dr);
3260   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3261      see if we can use the def stmt of the address.  */
3262   if (is_gimple_call (stmt)
3263       && gimple_call_internal_p (stmt)
3264       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3265           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3266       && TREE_CODE (base) == MEM_REF
3267       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3268       && integer_zerop (TREE_OPERAND (base, 1))
3269       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3270     {
3271       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3272       if (is_gimple_assign (def_stmt)
3273           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3274         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3275     }
3276
3277   /* The gather and scatter builtins need address of the form
3278      loop_invariant + vector * {1, 2, 4, 8}
3279      or
3280      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3281      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3282      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3283      multiplications and additions in it.  To get a vector, we need
3284      a single SSA_NAME that will be defined in the loop and will
3285      contain everything that is not loop invariant and that can be
3286      vectorized.  The following code attempts to find such a preexistng
3287      SSA_NAME OFF and put the loop invariants into a tree BASE
3288      that can be gimplified before the loop.  */
3289   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3290                               &punsignedp, &reversep, &pvolatilep);
3291   gcc_assert (base && !reversep);
3292   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3293
3294   if (TREE_CODE (base) == MEM_REF)
3295     {
3296       if (!integer_zerop (TREE_OPERAND (base, 1)))
3297         {
3298           if (off == NULL_TREE)
3299             off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3300           else
3301             off = size_binop (PLUS_EXPR, off,
3302                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3303         }
3304       base = TREE_OPERAND (base, 0);
3305     }
3306   else
3307     base = build_fold_addr_expr (base);
3308
3309   if (off == NULL_TREE)
3310     off = size_zero_node;
3311
3312   /* If base is not loop invariant, either off is 0, then we start with just
3313      the constant offset in the loop invariant BASE and continue with base
3314      as OFF, otherwise give up.
3315      We could handle that case by gimplifying the addition of base + off
3316      into some SSA_NAME and use that as off, but for now punt.  */
3317   if (!expr_invariant_in_loop_p (loop, base))
3318     {
3319       if (!integer_zerop (off))
3320         return false;
3321       off = base;
3322       base = size_int (pbytepos);
3323     }
3324   /* Otherwise put base + constant offset into the loop invariant BASE
3325      and continue with OFF.  */
3326   else
3327     {
3328       base = fold_convert (sizetype, base);
3329       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3330     }
3331
3332   /* OFF at this point may be either a SSA_NAME or some tree expression
3333      from get_inner_reference.  Try to peel off loop invariants from it
3334      into BASE as long as possible.  */
3335   STRIP_NOPS (off);
3336   while (offtype == NULL_TREE)
3337     {
3338       enum tree_code code;
3339       tree op0, op1, add = NULL_TREE;
3340
3341       if (TREE_CODE (off) == SSA_NAME)
3342         {
3343           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3344
3345           if (expr_invariant_in_loop_p (loop, off))
3346             return false;
3347
3348           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3349             break;
3350
3351           op0 = gimple_assign_rhs1 (def_stmt);
3352           code = gimple_assign_rhs_code (def_stmt);
3353           op1 = gimple_assign_rhs2 (def_stmt);
3354         }
3355       else
3356         {
3357           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3358             return false;
3359           code = TREE_CODE (off);
3360           extract_ops_from_tree (off, &code, &op0, &op1);
3361         }
3362       switch (code)
3363         {
3364         case POINTER_PLUS_EXPR:
3365         case PLUS_EXPR:
3366           if (expr_invariant_in_loop_p (loop, op0))
3367             {
3368               add = op0;
3369               off = op1;
3370             do_add:
3371               add = fold_convert (sizetype, add);
3372               if (scale != 1)
3373                 add = size_binop (MULT_EXPR, add, size_int (scale));
3374               base = size_binop (PLUS_EXPR, base, add);
3375               continue;
3376             }
3377           if (expr_invariant_in_loop_p (loop, op1))
3378             {
3379               add = op1;
3380               off = op0;
3381               goto do_add;
3382             }
3383           break;
3384         case MINUS_EXPR:
3385           if (expr_invariant_in_loop_p (loop, op1))
3386             {
3387               add = fold_convert (sizetype, op1);
3388               add = size_binop (MINUS_EXPR, size_zero_node, add);
3389               off = op0;
3390               goto do_add;
3391             }
3392           break;
3393         case MULT_EXPR:
3394           if (scale == 1 && tree_fits_shwi_p (op1))
3395             {
3396               scale = tree_to_shwi (op1);
3397               off = op0;
3398               continue;
3399             }
3400           break;
3401         case SSA_NAME:
3402           off = op0;
3403           continue;
3404         CASE_CONVERT:
3405           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3406               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3407             break;
3408           if (TYPE_PRECISION (TREE_TYPE (op0))
3409               == TYPE_PRECISION (TREE_TYPE (off)))
3410             {
3411               off = op0;
3412               continue;
3413             }
3414           if (TYPE_PRECISION (TREE_TYPE (op0))
3415               < TYPE_PRECISION (TREE_TYPE (off)))
3416             {
3417               off = op0;
3418               offtype = TREE_TYPE (off);
3419               STRIP_NOPS (off);
3420               continue;
3421             }
3422           break;
3423         default:
3424           break;
3425         }
3426       break;
3427     }
3428
3429   /* If at the end OFF still isn't a SSA_NAME or isn't
3430      defined in the loop, punt.  */
3431   if (TREE_CODE (off) != SSA_NAME
3432       || expr_invariant_in_loop_p (loop, off))
3433     return false;
3434
3435   if (offtype == NULL_TREE)
3436     offtype = TREE_TYPE (off);
3437
3438   if (DR_IS_READ (dr))
3439     decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3440                                              offtype, scale);
3441   else
3442     decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3443                                               offtype, scale);
3444
3445   if (decl == NULL_TREE)
3446     return false;
3447
3448   info->decl = decl;
3449   info->base = base;
3450   info->offset = off;
3451   info->offset_dt = vect_unknown_def_type;
3452   info->offset_vectype = NULL_TREE;
3453   info->scale = scale;
3454   return true;
3455 }
3456
3457 /* Function vect_analyze_data_refs.
3458
3459   Find all the data references in the loop or basic block.
3460
3461    The general structure of the analysis of data refs in the vectorizer is as
3462    follows:
3463    1- vect_analyze_data_refs(loop/bb): call
3464       compute_data_dependences_for_loop/bb to find and analyze all data-refs
3465       in the loop/bb and their dependences.
3466    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3467    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3468    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3469
3470 */
3471
3472 bool
3473 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3474 {
3475   struct loop *loop = NULL;
3476   unsigned int i;
3477   struct data_reference *dr;
3478   tree scalar_type;
3479
3480   if (dump_enabled_p ())
3481     dump_printf_loc (MSG_NOTE, vect_location,
3482                      "=== vect_analyze_data_refs ===\n");
3483
3484   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3485     loop = LOOP_VINFO_LOOP (loop_vinfo);
3486
3487   /* Go through the data-refs, check that the analysis succeeded.  Update
3488      pointer from stmt_vec_info struct to DR and vectype.  */
3489
3490   vec<data_reference_p> datarefs = vinfo->datarefs;
3491   FOR_EACH_VEC_ELT (datarefs, i, dr)
3492     {
3493       gimple *stmt;
3494       stmt_vec_info stmt_info;
3495       tree base, offset, init;
3496       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3497       bool simd_lane_access = false;
3498       poly_uint64 vf;
3499
3500 again:
3501       if (!dr || !DR_REF (dr))
3502         {
3503           if (dump_enabled_p ())
3504             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3505                              "not vectorized: unhandled data-ref\n");
3506           return false;
3507         }
3508
3509       stmt = DR_STMT (dr);
3510       stmt_info = vinfo_for_stmt (stmt);
3511
3512       /* Discard clobbers from the dataref vector.  We will remove
3513          clobber stmts during vectorization.  */
3514       if (gimple_clobber_p (stmt))
3515         {
3516           free_data_ref (dr);
3517           if (i == datarefs.length () - 1)
3518             {
3519               datarefs.pop ();
3520               break;
3521             }
3522           datarefs.ordered_remove (i);
3523           dr = datarefs[i];
3524           goto again;
3525         }
3526
3527       /* Check that analysis of the data-ref succeeded.  */
3528       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3529           || !DR_STEP (dr))
3530         {
3531           bool maybe_gather
3532             = DR_IS_READ (dr)
3533               && !TREE_THIS_VOLATILE (DR_REF (dr))
3534               && targetm.vectorize.builtin_gather != NULL;
3535           bool maybe_scatter
3536             = DR_IS_WRITE (dr)
3537               && !TREE_THIS_VOLATILE (DR_REF (dr))
3538               && targetm.vectorize.builtin_scatter != NULL;
3539           bool maybe_simd_lane_access
3540             = is_a <loop_vec_info> (vinfo) && loop->simduid;
3541
3542           /* If target supports vector gather loads or scatter stores, or if
3543              this might be a SIMD lane access, see if they can't be used.  */
3544           if (is_a <loop_vec_info> (vinfo)
3545               && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3546               && !nested_in_vect_loop_p (loop, stmt))
3547             {
3548               struct data_reference *newdr
3549                 = create_data_ref (NULL, loop_containing_stmt (stmt),
3550                                    DR_REF (dr), stmt, !maybe_scatter,
3551                                    DR_IS_CONDITIONAL_IN_STMT (dr));
3552               gcc_assert (newdr != NULL && DR_REF (newdr));
3553               if (DR_BASE_ADDRESS (newdr)
3554                   && DR_OFFSET (newdr)
3555                   && DR_INIT (newdr)
3556                   && DR_STEP (newdr)
3557                   && integer_zerop (DR_STEP (newdr)))
3558                 {
3559                   if (maybe_simd_lane_access)
3560                     {
3561                       tree off = DR_OFFSET (newdr);
3562                       STRIP_NOPS (off);
3563                       if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3564                           && TREE_CODE (off) == MULT_EXPR
3565                           && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3566                         {
3567                           tree step = TREE_OPERAND (off, 1);
3568                           off = TREE_OPERAND (off, 0);
3569                           STRIP_NOPS (off);
3570                           if (CONVERT_EXPR_P (off)
3571                               && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3572                                                                           0)))
3573                                  < TYPE_PRECISION (TREE_TYPE (off)))
3574                             off = TREE_OPERAND (off, 0);
3575                           if (TREE_CODE (off) == SSA_NAME)
3576                             {
3577                               gimple *def = SSA_NAME_DEF_STMT (off);
3578                               tree reft = TREE_TYPE (DR_REF (newdr));
3579                               if (is_gimple_call (def)
3580                                   && gimple_call_internal_p (def)
3581                                   && (gimple_call_internal_fn (def)
3582                                       == IFN_GOMP_SIMD_LANE))
3583                                 {
3584                                   tree arg = gimple_call_arg (def, 0);
3585                                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
3586                                   arg = SSA_NAME_VAR (arg);
3587                                   if (arg == loop->simduid
3588                                       /* For now.  */
3589                                       && tree_int_cst_equal
3590                                            (TYPE_SIZE_UNIT (reft),
3591                                             step))
3592                                     {
3593                                       DR_OFFSET (newdr) = ssize_int (0);
3594                                       DR_STEP (newdr) = step;
3595                                       DR_OFFSET_ALIGNMENT (newdr)
3596                                         = BIGGEST_ALIGNMENT;
3597                                       DR_STEP_ALIGNMENT (newdr)
3598                                         = highest_pow2_factor (step);
3599                                       dr = newdr;
3600                                       simd_lane_access = true;
3601                                     }
3602                                 }
3603                             }
3604                         }
3605                     }
3606                   if (!simd_lane_access && (maybe_gather || maybe_scatter))
3607                     {
3608                       dr = newdr;
3609                       if (maybe_gather)
3610                         gatherscatter = GATHER;
3611                       else
3612                         gatherscatter = SCATTER;
3613                     }
3614                 }
3615               if (gatherscatter == SG_NONE && !simd_lane_access)
3616                 free_data_ref (newdr);
3617             }
3618
3619           if (gatherscatter == SG_NONE && !simd_lane_access)
3620             {
3621               if (dump_enabled_p ())
3622                 {
3623                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3624                                    "not vectorized: data ref analysis "
3625                                    "failed ");
3626                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3627                 }
3628
3629               if (is_a <bb_vec_info> (vinfo))
3630                 break;
3631
3632               return false;
3633             }
3634         }
3635
3636       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3637         {
3638           if (dump_enabled_p ())
3639             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3640                              "not vectorized: base addr of dr is a "
3641                              "constant\n");
3642
3643           if (is_a <bb_vec_info> (vinfo))
3644             break;
3645
3646           if (gatherscatter != SG_NONE || simd_lane_access)
3647             free_data_ref (dr);
3648           return false;
3649         }
3650
3651       if (TREE_THIS_VOLATILE (DR_REF (dr)))
3652         {
3653           if (dump_enabled_p ())
3654             {
3655               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3656                                "not vectorized: volatile type ");
3657               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3658             }
3659
3660           if (is_a <bb_vec_info> (vinfo))
3661             break;
3662
3663           return false;
3664         }
3665
3666       if (stmt_can_throw_internal (stmt))
3667         {
3668           if (dump_enabled_p ())
3669             {
3670               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3671                                "not vectorized: statement can throw an "
3672                                "exception ");
3673               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3674             }
3675
3676           if (is_a <bb_vec_info> (vinfo))
3677             break;
3678
3679           if (gatherscatter != SG_NONE || simd_lane_access)
3680             free_data_ref (dr);
3681           return false;
3682         }
3683
3684       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3685           && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3686         {
3687           if (dump_enabled_p ())
3688             {
3689               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3690                                "not vectorized: statement is bitfield "
3691                                "access ");
3692               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3693             }
3694
3695           if (is_a <bb_vec_info> (vinfo))
3696             break;
3697
3698           if (gatherscatter != SG_NONE || simd_lane_access)
3699             free_data_ref (dr);
3700           return false;
3701         }
3702
3703       base = unshare_expr (DR_BASE_ADDRESS (dr));
3704       offset = unshare_expr (DR_OFFSET (dr));
3705       init = unshare_expr (DR_INIT (dr));
3706
3707       if (is_gimple_call (stmt)
3708           && (!gimple_call_internal_p (stmt)
3709               || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3710                   && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3711         {
3712           if (dump_enabled_p ())
3713             {
3714               dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
3715                                "not vectorized: dr in a call ");
3716               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3717             }
3718
3719           if (is_a <bb_vec_info> (vinfo))
3720             break;
3721
3722           if (gatherscatter != SG_NONE || simd_lane_access)
3723             free_data_ref (dr);
3724           return false;
3725         }
3726
3727       /* Update DR field in stmt_vec_info struct.  */
3728
3729       /* If the dataref is in an inner-loop of the loop that is considered for
3730          for vectorization, we also want to analyze the access relative to
3731          the outer-loop (DR contains information only relative to the
3732          inner-most enclosing loop).  We do that by building a reference to the
3733          first location accessed by the inner-loop, and analyze it relative to
3734          the outer-loop.  */
3735       if (loop && nested_in_vect_loop_p (loop, stmt))
3736         {
3737           /* Build a reference to the first location accessed by the
3738              inner loop: *(BASE + INIT + OFFSET).  By construction,
3739              this address must be invariant in the inner loop, so we
3740              can consider it as being used in the outer loop.  */
3741           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3742                                           init, offset);
3743           tree init_addr = fold_build_pointer_plus (base, init_offset);
3744           tree init_ref = build_fold_indirect_ref (init_addr);
3745
3746           if (dump_enabled_p ())
3747             {
3748               dump_printf_loc (MSG_NOTE, vect_location,
3749                                "analyze in outer loop: ");
3750               dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3751               dump_printf (MSG_NOTE, "\n");
3752             }
3753
3754           if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3755                                      init_ref, loop))
3756             /* dr_analyze_innermost already explained the failure.  */
3757             return false;
3758
3759           if (dump_enabled_p ())
3760             {
3761               dump_printf_loc (MSG_NOTE, vect_location,
3762                                "\touter base_address: ");
3763               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3764                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3765               dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3766               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3767                                  STMT_VINFO_DR_OFFSET (stmt_info));
3768               dump_printf (MSG_NOTE,
3769                            "\n\touter constant offset from base address: ");
3770               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3771                                  STMT_VINFO_DR_INIT (stmt_info));
3772               dump_printf (MSG_NOTE, "\n\touter step: ");
3773               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3774                                  STMT_VINFO_DR_STEP (stmt_info));
3775               dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3776                            STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3777               dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3778                            STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3779               dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3780                            STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3781               dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3782                            STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3783             }
3784         }
3785
3786       if (STMT_VINFO_DATA_REF (stmt_info))
3787         {
3788           if (dump_enabled_p ())
3789             {
3790               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3791                                "not vectorized: more than one data ref "
3792                                "in stmt: ");
3793               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3794             }
3795
3796           if (is_a <bb_vec_info> (vinfo))
3797             break;
3798
3799           if (gatherscatter != SG_NONE || simd_lane_access)
3800             free_data_ref (dr);
3801           return false;
3802         }
3803
3804       STMT_VINFO_DATA_REF (stmt_info) = dr;
3805       if (simd_lane_access)
3806         {
3807           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3808           free_data_ref (datarefs[i]);
3809           datarefs[i] = dr;
3810         }
3811
3812       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3813           && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3814           && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3815         {
3816           if (dump_enabled_p ())
3817             {
3818               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3819                                "not vectorized: base object not addressable "
3820                                "for stmt: ");
3821               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3822             }
3823           if (is_a <bb_vec_info> (vinfo))
3824             {
3825               /* In BB vectorization the ref can still participate
3826                  in dependence analysis, we just can't vectorize it.  */
3827               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3828               continue;
3829             }
3830           return false;
3831         }
3832
3833       /* Set vectype for STMT.  */
3834       scalar_type = TREE_TYPE (DR_REF (dr));
3835       STMT_VINFO_VECTYPE (stmt_info)
3836         = get_vectype_for_scalar_type (scalar_type);
3837       if (!STMT_VINFO_VECTYPE (stmt_info))
3838         {
3839           if (dump_enabled_p ())
3840             {
3841               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3842                                "not vectorized: no vectype for stmt: ");
3843               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3844               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3845               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3846                                  scalar_type);
3847               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3848             }
3849
3850           if (is_a <bb_vec_info> (vinfo))
3851             {
3852               /* No vector type is fine, the ref can still participate
3853                  in dependence analysis, we just can't vectorize it.  */
3854               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3855               continue;
3856             }
3857
3858           if (gatherscatter != SG_NONE || simd_lane_access)
3859             {
3860               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3861               if (gatherscatter != SG_NONE)
3862                 free_data_ref (dr);
3863             }
3864           return false;
3865         }
3866       else
3867         {
3868           if (dump_enabled_p ())
3869             {
3870               dump_printf_loc (MSG_NOTE, vect_location,
3871                                "got vectype for stmt: ");
3872               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3873               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3874                                  STMT_VINFO_VECTYPE (stmt_info));
3875               dump_printf (MSG_NOTE, "\n");
3876             }
3877         }
3878
3879       /* Adjust the minimal vectorization factor according to the
3880          vector type.  */
3881       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3882       *min_vf = upper_bound (*min_vf, vf);
3883
3884       if (gatherscatter != SG_NONE)
3885         {
3886           gather_scatter_info gs_info;
3887           if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3888                                           &gs_info)
3889               || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3890             {
3891               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3892               free_data_ref (dr);
3893               if (dump_enabled_p ())
3894                 {
3895                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3896                                    (gatherscatter == GATHER) ?
3897                                    "not vectorized: not suitable for gather "
3898                                    "load " :
3899                                    "not vectorized: not suitable for scatter "
3900                                    "store ");
3901                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3902                 }
3903               return false;
3904             }
3905
3906           free_data_ref (datarefs[i]);
3907           datarefs[i] = dr;
3908           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3909         }
3910
3911       else if (is_a <loop_vec_info> (vinfo)
3912                && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3913         {
3914           if (nested_in_vect_loop_p (loop, stmt))
3915             {
3916               if (dump_enabled_p ())
3917                 {
3918                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3919                                    "not vectorized: not suitable for strided "
3920                                    "load ");
3921                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3922                 }
3923               return false;
3924             }
3925           STMT_VINFO_STRIDED_P (stmt_info) = true;
3926         }
3927     }
3928
3929   /* If we stopped analysis at the first dataref we could not analyze
3930      when trying to vectorize a basic-block mark the rest of the datarefs
3931      as not vectorizable and truncate the vector of datarefs.  That
3932      avoids spending useless time in analyzing their dependence.  */
3933   if (i != datarefs.length ())
3934     {
3935       gcc_assert (is_a <bb_vec_info> (vinfo));
3936       for (unsigned j = i; j < datarefs.length (); ++j)
3937         {
3938           data_reference_p dr = datarefs[j];
3939           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3940           free_data_ref (dr);
3941         }
3942       datarefs.truncate (i);
3943     }
3944
3945   return true;
3946 }
3947
3948
3949 /* Function vect_get_new_vect_var.
3950
3951    Returns a name for a new variable.  The current naming scheme appends the
3952    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3953    the name of vectorizer generated variables, and appends that to NAME if
3954    provided.  */
3955
3956 tree
3957 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3958 {
3959   const char *prefix;
3960   tree new_vect_var;
3961
3962   switch (var_kind)
3963   {
3964   case vect_simple_var:
3965     prefix = "vect";
3966     break;
3967   case vect_scalar_var:
3968     prefix = "stmp";
3969     break;
3970   case vect_mask_var:
3971     prefix = "mask";
3972     break;
3973   case vect_pointer_var:
3974     prefix = "vectp";
3975     break;
3976   default:
3977     gcc_unreachable ();
3978   }
3979
3980   if (name)
3981     {
3982       char* tmp = concat (prefix, "_", name, NULL);
3983       new_vect_var = create_tmp_reg (type, tmp);
3984       free (tmp);
3985     }
3986   else
3987     new_vect_var = create_tmp_reg (type, prefix);
3988
3989   return new_vect_var;
3990 }
3991
3992 /* Like vect_get_new_vect_var but return an SSA name.  */
3993
3994 tree
3995 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3996 {
3997   const char *prefix;
3998   tree new_vect_var;
3999
4000   switch (var_kind)
4001   {
4002   case vect_simple_var:
4003     prefix = "vect";
4004     break;
4005   case vect_scalar_var:
4006     prefix = "stmp";
4007     break;
4008   case vect_pointer_var:
4009     prefix = "vectp";
4010     break;
4011   default:
4012     gcc_unreachable ();
4013   }
4014
4015   if (name)
4016     {
4017       char* tmp = concat (prefix, "_", name, NULL);
4018       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4019       free (tmp);
4020     }
4021   else
4022     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4023
4024   return new_vect_var;
4025 }
4026
4027 /* Duplicate ptr info and set alignment/misaligment on NAME from DR.  */
4028
4029 static void
4030 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4031 {
4032   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4033   int misalign = DR_MISALIGNMENT (dr);
4034   if (misalign == DR_MISALIGNMENT_UNKNOWN)
4035     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4036   else
4037     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4038                             DR_TARGET_ALIGNMENT (dr), misalign);
4039 }
4040
4041 /* Function vect_create_addr_base_for_vector_ref.
4042
4043    Create an expression that computes the address of the first memory location
4044    that will be accessed for a data reference.
4045
4046    Input:
4047    STMT: The statement containing the data reference.
4048    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4049    OFFSET: Optional. If supplied, it is be added to the initial address.
4050    LOOP:    Specify relative to which loop-nest should the address be computed.
4051             For example, when the dataref is in an inner-loop nested in an
4052             outer-loop that is now being vectorized, LOOP can be either the
4053             outer-loop, or the inner-loop.  The first memory location accessed
4054             by the following dataref ('in' points to short):
4055
4056                 for (i=0; i<N; i++)
4057                    for (j=0; j<M; j++)
4058                      s += in[i+j]
4059
4060             is as follows:
4061             if LOOP=i_loop:     &in             (relative to i_loop)
4062             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
4063    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4064             initial address.  Unlike OFFSET, which is number of elements to
4065             be added, BYTE_OFFSET is measured in bytes.
4066
4067    Output:
4068    1. Return an SSA_NAME whose value is the address of the memory location of
4069       the first vector of the data reference.
4070    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4071       these statement(s) which define the returned SSA_NAME.
4072
4073    FORNOW: We are only handling array accesses with step 1.  */
4074
4075 tree
4076 vect_create_addr_base_for_vector_ref (gimple *stmt,
4077                                       gimple_seq *new_stmt_list,
4078                                       tree offset,
4079                                       tree byte_offset)
4080 {
4081   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4082   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4083   const char *base_name;
4084   tree addr_base;
4085   tree dest;
4086   gimple_seq seq = NULL;
4087   tree vect_ptr_type;
4088   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4089   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4090   innermost_loop_behavior *drb = vect_dr_behavior (dr);
4091
4092   tree data_ref_base = unshare_expr (drb->base_address);
4093   tree base_offset = unshare_expr (drb->offset);
4094   tree init = unshare_expr (drb->init);
4095
4096   if (loop_vinfo)
4097     base_name = get_name (data_ref_base);
4098   else
4099     {
4100       base_offset = ssize_int (0);
4101       init = ssize_int (0);
4102       base_name = get_name (DR_REF (dr));
4103     }
4104
4105   /* Create base_offset */
4106   base_offset = size_binop (PLUS_EXPR,
4107                             fold_convert (sizetype, base_offset),
4108                             fold_convert (sizetype, init));
4109
4110   if (offset)
4111     {
4112       offset = fold_build2 (MULT_EXPR, sizetype,
4113                             fold_convert (sizetype, offset), step);
4114       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4115                                  base_offset, offset);
4116     }
4117   if (byte_offset)
4118     {
4119       byte_offset = fold_convert (sizetype, byte_offset);
4120       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4121                                  base_offset, byte_offset);
4122     }
4123
4124   /* base + base_offset */
4125   if (loop_vinfo)
4126     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4127   else
4128     {
4129       addr_base = build1 (ADDR_EXPR,
4130                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
4131                           unshare_expr (DR_REF (dr)));
4132     }
4133
4134   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4135   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4136   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4137   gimple_seq_add_seq (new_stmt_list, seq);
4138
4139   if (DR_PTR_INFO (dr)
4140       && TREE_CODE (addr_base) == SSA_NAME
4141       && !SSA_NAME_PTR_INFO (addr_base))
4142     {
4143       vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4144       if (offset || byte_offset)
4145         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4146     }
4147
4148   if (dump_enabled_p ())
4149     {
4150       dump_printf_loc (MSG_NOTE, vect_location, "created ");
4151       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4152       dump_printf (MSG_NOTE, "\n");
4153     }
4154
4155   return addr_base;
4156 }
4157
4158
4159 /* Function vect_create_data_ref_ptr.
4160
4161    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4162    location accessed in the loop by STMT, along with the def-use update
4163    chain to appropriately advance the pointer through the loop iterations.
4164    Also set aliasing information for the pointer.  This pointer is used by
4165    the callers to this function to create a memory reference expression for
4166    vector load/store access.
4167
4168    Input:
4169    1. STMT: a stmt that references memory. Expected to be of the form
4170          GIMPLE_ASSIGN <name, data-ref> or
4171          GIMPLE_ASSIGN <data-ref, name>.
4172    2. AGGR_TYPE: the type of the reference, which should be either a vector
4173         or an array.
4174    3. AT_LOOP: the loop where the vector memref is to be created.
4175    4. OFFSET (optional): an offset to be added to the initial address accessed
4176         by the data-ref in STMT.
4177    5. BSI: location where the new stmts are to be placed if there is no loop
4178    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4179         pointing to the initial address.
4180    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4181         to the initial address accessed by the data-ref in STMT.  This is
4182         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4183         in bytes.
4184
4185    Output:
4186    1. Declare a new ptr to vector_type, and have it point to the base of the
4187       data reference (initial addressed accessed by the data reference).
4188       For example, for vector of type V8HI, the following code is generated:
4189
4190       v8hi *ap;
4191       ap = (v8hi *)initial_address;
4192
4193       if OFFSET is not supplied:
4194          initial_address = &a[init];
4195       if OFFSET is supplied:
4196          initial_address = &a[init + OFFSET];
4197       if BYTE_OFFSET is supplied:
4198          initial_address = &a[init] + BYTE_OFFSET;
4199
4200       Return the initial_address in INITIAL_ADDRESS.
4201
4202    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4203       update the pointer in each iteration of the loop.
4204
4205       Return the increment stmt that updates the pointer in PTR_INCR.
4206
4207    3. Set INV_P to true if the access pattern of the data reference in the
4208       vectorized loop is invariant.  Set it to false otherwise.
4209
4210    4. Return the pointer.  */
4211
4212 tree
4213 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4214                           tree offset, tree *initial_address,
4215                           gimple_stmt_iterator *gsi, gimple **ptr_incr,
4216                           bool only_init, bool *inv_p, tree byte_offset)
4217 {
4218   const char *base_name;
4219   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4220   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4221   struct loop *loop = NULL;
4222   bool nested_in_vect_loop = false;
4223   struct loop *containing_loop = NULL;
4224   tree aggr_ptr_type;
4225   tree aggr_ptr;
4226   tree new_temp;
4227   gimple_seq new_stmt_list = NULL;
4228   edge pe = NULL;
4229   basic_block new_bb;
4230   tree aggr_ptr_init;
4231   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4232   tree aptr;
4233   gimple_stmt_iterator incr_gsi;
4234   bool insert_after;
4235   tree indx_before_incr, indx_after_incr;
4236   gimple *incr;
4237   tree step;
4238   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4239
4240   gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4241               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4242
4243   if (loop_vinfo)
4244     {
4245       loop = LOOP_VINFO_LOOP (loop_vinfo);
4246       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4247       containing_loop = (gimple_bb (stmt))->loop_father;
4248       pe = loop_preheader_edge (loop);
4249     }
4250   else
4251     {
4252       gcc_assert (bb_vinfo);
4253       only_init = true;
4254       *ptr_incr = NULL;
4255     }
4256
4257   /* Check the step (evolution) of the load in LOOP, and record
4258      whether it's invariant.  */
4259   step = vect_dr_behavior (dr)->step;
4260   if (integer_zerop (step))
4261     *inv_p = true;
4262   else
4263     *inv_p = false;
4264
4265   /* Create an expression for the first address accessed by this load
4266      in LOOP.  */
4267   base_name = get_name (DR_BASE_ADDRESS (dr));
4268
4269   if (dump_enabled_p ())
4270     {
4271       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4272       dump_printf_loc (MSG_NOTE, vect_location,
4273                        "create %s-pointer variable to type: ",
4274                        get_tree_code_name (TREE_CODE (aggr_type)));
4275       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4276       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4277         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4278       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4279         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4280       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4281         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4282       else
4283         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4284       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4285       dump_printf (MSG_NOTE, "\n");
4286     }
4287
4288   /* (1) Create the new aggregate-pointer variable.
4289      Vector and array types inherit the alias set of their component
4290      type by default so we need to use a ref-all pointer if the data
4291      reference does not conflict with the created aggregated data
4292      reference because it is not addressable.  */
4293   bool need_ref_all = false;
4294   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4295                               get_alias_set (DR_REF (dr))))
4296     need_ref_all = true;
4297   /* Likewise for any of the data references in the stmt group.  */
4298   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4299     {
4300       gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4301       do
4302         {
4303           stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4304           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4305           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4306                                       get_alias_set (DR_REF (sdr))))
4307             {
4308               need_ref_all = true;
4309               break;
4310             }
4311           orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4312         }
4313       while (orig_stmt);
4314     }
4315   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4316                                                need_ref_all);
4317   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4318
4319
4320   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4321      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4322      def-use update cycles for the pointer: one relative to the outer-loop
4323      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4324      to the inner-loop (which is the inner-most loop containing the dataref),
4325      and this is done be step (5) below.
4326
4327      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4328      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4329      redundant.  Steps (3),(4) create the following:
4330
4331         vp0 = &base_addr;
4332         LOOP:   vp1 = phi(vp0,vp2)
4333                 ...
4334                 ...
4335                 vp2 = vp1 + step
4336                 goto LOOP
4337
4338      If there is an inner-loop nested in loop, then step (5) will also be
4339      applied, and an additional update in the inner-loop will be created:
4340
4341         vp0 = &base_addr;
4342         LOOP:   vp1 = phi(vp0,vp2)
4343                 ...
4344         inner:     vp3 = phi(vp1,vp4)
4345                    vp4 = vp3 + inner_step
4346                    if () goto inner
4347                 ...
4348                 vp2 = vp1 + step
4349                 if () goto LOOP   */
4350
4351   /* (2) Calculate the initial address of the aggregate-pointer, and set
4352      the aggregate-pointer to point to it before the loop.  */
4353
4354   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4355
4356   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4357                                                    offset, byte_offset);
4358   if (new_stmt_list)
4359     {
4360       if (pe)
4361         {
4362           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4363           gcc_assert (!new_bb);
4364         }
4365       else
4366         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4367     }
4368
4369   *initial_address = new_temp;
4370   aggr_ptr_init = new_temp;
4371
4372   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4373      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4374      inner-loop nested in LOOP (during outer-loop vectorization).  */
4375
4376   /* No update in loop is required.  */
4377   if (only_init && (!loop_vinfo || at_loop == loop))
4378     aptr = aggr_ptr_init;
4379   else
4380     {
4381       /* The step of the aggregate pointer is the type size.  */
4382       tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4383       /* One exception to the above is when the scalar step of the load in
4384          LOOP is zero. In this case the step here is also zero.  */
4385       if (*inv_p)
4386         iv_step = size_zero_node;
4387       else if (tree_int_cst_sgn (step) == -1)
4388         iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4389
4390       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4391
4392       create_iv (aggr_ptr_init,
4393                  fold_convert (aggr_ptr_type, iv_step),
4394                  aggr_ptr, loop, &incr_gsi, insert_after,
4395                  &indx_before_incr, &indx_after_incr);
4396       incr = gsi_stmt (incr_gsi);
4397       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4398
4399       /* Copy the points-to information if it exists. */
4400       if (DR_PTR_INFO (dr))
4401         {
4402           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4403           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4404         }
4405       if (ptr_incr)
4406         *ptr_incr = incr;
4407
4408       aptr = indx_before_incr;
4409     }
4410
4411   if (!nested_in_vect_loop || only_init)
4412     return aptr;
4413
4414
4415   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4416      nested in LOOP, if exists.  */
4417
4418   gcc_assert (nested_in_vect_loop);
4419   if (!only_init)
4420     {
4421       standard_iv_increment_position (containing_loop, &incr_gsi,
4422                                       &insert_after);
4423       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4424                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4425                  &indx_after_incr);
4426       incr = gsi_stmt (incr_gsi);
4427       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4428
4429       /* Copy the points-to information if it exists. */
4430       if (DR_PTR_INFO (dr))
4431         {
4432           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4433           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4434         }
4435       if (ptr_incr)
4436         *ptr_incr = incr;
4437
4438       return indx_before_incr;
4439     }
4440   else
4441     gcc_unreachable ();
4442 }
4443
4444
4445 /* Function bump_vector_ptr
4446
4447    Increment a pointer (to a vector type) by vector-size. If requested,
4448    i.e. if PTR-INCR is given, then also connect the new increment stmt
4449    to the existing def-use update-chain of the pointer, by modifying
4450    the PTR_INCR as illustrated below:
4451
4452    The pointer def-use update-chain before this function:
4453                         DATAREF_PTR = phi (p_0, p_2)
4454                         ....
4455         PTR_INCR:       p_2 = DATAREF_PTR + step
4456
4457    The pointer def-use update-chain after this function:
4458                         DATAREF_PTR = phi (p_0, p_2)
4459                         ....
4460                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4461                         ....
4462         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4463
4464    Input:
4465    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4466                  in the loop.
4467    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4468               the loop.  The increment amount across iterations is expected
4469               to be vector_size.
4470    BSI - location where the new update stmt is to be placed.
4471    STMT - the original scalar memory-access stmt that is being vectorized.
4472    BUMP - optional. The offset by which to bump the pointer. If not given,
4473           the offset is assumed to be vector_size.
4474
4475    Output: Return NEW_DATAREF_PTR as illustrated above.
4476
4477 */
4478
4479 tree
4480 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4481                  gimple *stmt, tree bump)
4482 {
4483   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4484   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4485   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4486   tree update = TYPE_SIZE_UNIT (vectype);
4487   gassign *incr_stmt;
4488   ssa_op_iter iter;
4489   use_operand_p use_p;
4490   tree new_dataref_ptr;
4491
4492   if (bump)
4493     update = bump;
4494
4495   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4496     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4497   else
4498     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4499   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4500                                    dataref_ptr, update);
4501   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4502
4503   /* Copy the points-to information if it exists. */
4504   if (DR_PTR_INFO (dr))
4505     {
4506       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4507       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4508     }
4509
4510   if (!ptr_incr)
4511     return new_dataref_ptr;
4512
4513   /* Update the vector-pointer's cross-iteration increment.  */
4514   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4515     {
4516       tree use = USE_FROM_PTR (use_p);
4517
4518       if (use == dataref_ptr)
4519         SET_USE (use_p, new_dataref_ptr);
4520       else
4521         gcc_assert (tree_int_cst_compare (use, update) == 0);
4522     }
4523
4524   return new_dataref_ptr;
4525 }
4526
4527
4528 /* Function vect_create_destination_var.
4529
4530    Create a new temporary of type VECTYPE.  */
4531
4532 tree
4533 vect_create_destination_var (tree scalar_dest, tree vectype)
4534 {
4535   tree vec_dest;
4536   const char *name;
4537   char *new_name;
4538   tree type;
4539   enum vect_var_kind kind;
4540
4541   kind = vectype
4542     ? VECTOR_BOOLEAN_TYPE_P (vectype)
4543     ? vect_mask_var
4544     : vect_simple_var
4545     : vect_scalar_var;
4546   type = vectype ? vectype : TREE_TYPE (scalar_dest);
4547
4548   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4549
4550   name = get_name (scalar_dest);
4551   if (name)
4552     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4553   else
4554     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4555   vec_dest = vect_get_new_vect_var (type, kind, new_name);
4556   free (new_name);
4557
4558   return vec_dest;
4559 }
4560
4561 /* Function vect_grouped_store_supported.
4562
4563    Returns TRUE if interleave high and interleave low permutations
4564    are supported, and FALSE otherwise.  */
4565
4566 bool
4567 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4568 {
4569   machine_mode mode = TYPE_MODE (vectype);
4570
4571   /* vect_permute_store_chain requires the group size to be equal to 3 or
4572      be a power of two.  */
4573   if (count != 3 && exact_log2 (count) == -1)
4574     {
4575       if (dump_enabled_p ())
4576         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4577                          "the size of the group of accesses"
4578                          " is not a power of 2 or not eqaul to 3\n");
4579       return false;
4580     }
4581
4582   /* Check that the permutation is supported.  */
4583   if (VECTOR_MODE_P (mode))
4584     {
4585       unsigned int i;
4586       if (count == 3)
4587         {
4588           unsigned int j0 = 0, j1 = 0, j2 = 0;
4589           unsigned int i, j;
4590
4591           unsigned int nelt;
4592           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4593             {
4594               if (dump_enabled_p ())
4595                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4596                                  "cannot handle groups of 3 stores for"
4597                                  " variable-length vectors\n");
4598               return false;
4599             }
4600
4601           vec_perm_builder sel (nelt, nelt, 1);
4602           sel.quick_grow (nelt);
4603           vec_perm_indices indices;
4604           for (j = 0; j < 3; j++)
4605             {
4606               int nelt0 = ((3 - j) * nelt) % 3;
4607               int nelt1 = ((3 - j) * nelt + 1) % 3;
4608               int nelt2 = ((3 - j) * nelt + 2) % 3;
4609               for (i = 0; i < nelt; i++)
4610                 {
4611                   if (3 * i + nelt0 < nelt)
4612                     sel[3 * i + nelt0] = j0++;
4613                   if (3 * i + nelt1 < nelt)
4614                     sel[3 * i + nelt1] = nelt + j1++;
4615                   if (3 * i + nelt2 < nelt)
4616                     sel[3 * i + nelt2] = 0;
4617                 }
4618               indices.new_vector (sel, 2, nelt);
4619               if (!can_vec_perm_const_p (mode, indices))
4620                 {
4621                   if (dump_enabled_p ())
4622                     dump_printf (MSG_MISSED_OPTIMIZATION,
4623                                  "permutation op not supported by target.\n");
4624                   return false;
4625                 }
4626
4627               for (i = 0; i < nelt; i++)
4628                 {
4629                   if (3 * i + nelt0 < nelt)
4630                     sel[3 * i + nelt0] = 3 * i + nelt0;
4631                   if (3 * i + nelt1 < nelt)
4632                     sel[3 * i + nelt1] = 3 * i + nelt1;
4633                   if (3 * i + nelt2 < nelt)
4634                     sel[3 * i + nelt2] = nelt + j2++;
4635                 }
4636               indices.new_vector (sel, 2, nelt);
4637               if (!can_vec_perm_const_p (mode, indices))
4638                 {
4639                   if (dump_enabled_p ())
4640                     dump_printf (MSG_MISSED_OPTIMIZATION,
4641                                  "permutation op not supported by target.\n");
4642                   return false;
4643                 }
4644             }
4645           return true;
4646         }
4647       else
4648         {
4649           /* If length is not equal to 3 then only power of 2 is supported.  */
4650           gcc_assert (pow2p_hwi (count));
4651           poly_uint64 nelt = GET_MODE_NUNITS (mode);
4652
4653           /* The encoding has 2 interleaved stepped patterns.  */
4654           vec_perm_builder sel (nelt, 2, 3);
4655           sel.quick_grow (6);
4656           for (i = 0; i < 3; i++)
4657             {
4658               sel[i * 2] = i;
4659               sel[i * 2 + 1] = i + nelt;
4660             }
4661           vec_perm_indices indices (sel, 2, nelt);
4662           if (can_vec_perm_const_p (mode, indices))
4663             {
4664               for (i = 0; i < 6; i++)
4665                 sel[i] += exact_div (nelt, 2);
4666               indices.new_vector (sel, 2, nelt);
4667               if (can_vec_perm_const_p (mode, indices))
4668                 return true;
4669             }
4670         }
4671     }
4672
4673   if (dump_enabled_p ())
4674     dump_printf (MSG_MISSED_OPTIMIZATION,
4675                  "permutaion op not supported by target.\n");
4676   return false;
4677 }
4678
4679
4680 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4681    type VECTYPE.  */
4682
4683 bool
4684 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4685 {
4686   return vect_lanes_optab_supported_p ("vec_store_lanes",
4687                                        vec_store_lanes_optab,
4688                                        vectype, count);
4689 }
4690
4691
4692 /* Function vect_permute_store_chain.
4693
4694    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4695    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4696    the data correctly for the stores.  Return the final references for stores
4697    in RESULT_CHAIN.
4698
4699    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4700    The input is 4 vectors each containing 8 elements.  We assign a number to
4701    each element, the input sequence is:
4702
4703    1st vec:   0  1  2  3  4  5  6  7
4704    2nd vec:   8  9 10 11 12 13 14 15
4705    3rd vec:  16 17 18 19 20 21 22 23
4706    4th vec:  24 25 26 27 28 29 30 31
4707
4708    The output sequence should be:
4709
4710    1st vec:  0  8 16 24  1  9 17 25
4711    2nd vec:  2 10 18 26  3 11 19 27
4712    3rd vec:  4 12 20 28  5 13 21 30
4713    4th vec:  6 14 22 30  7 15 23 31
4714
4715    i.e., we interleave the contents of the four vectors in their order.
4716
4717    We use interleave_high/low instructions to create such output.  The input of
4718    each interleave_high/low operation is two vectors:
4719    1st vec    2nd vec
4720    0 1 2 3    4 5 6 7
4721    the even elements of the result vector are obtained left-to-right from the
4722    high/low elements of the first vector.  The odd elements of the result are
4723    obtained left-to-right from the high/low elements of the second vector.
4724    The output of interleave_high will be:   0 4 1 5
4725    and of interleave_low:                   2 6 3 7
4726
4727
4728    The permutation is done in log LENGTH stages.  In each stage interleave_high
4729    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4730    where the first argument is taken from the first half of DR_CHAIN and the
4731    second argument from it's second half.
4732    In our example,
4733
4734    I1: interleave_high (1st vec, 3rd vec)
4735    I2: interleave_low (1st vec, 3rd vec)
4736    I3: interleave_high (2nd vec, 4th vec)
4737    I4: interleave_low (2nd vec, 4th vec)
4738
4739    The output for the first stage is:
4740
4741    I1:  0 16  1 17  2 18  3 19
4742    I2:  4 20  5 21  6 22  7 23
4743    I3:  8 24  9 25 10 26 11 27
4744    I4: 12 28 13 29 14 30 15 31
4745
4746    The output of the second stage, i.e. the final result is:
4747
4748    I1:  0  8 16 24  1  9 17 25
4749    I2:  2 10 18 26  3 11 19 27
4750    I3:  4 12 20 28  5 13 21 30
4751    I4:  6 14 22 30  7 15 23 31.  */
4752
4753 void
4754 vect_permute_store_chain (vec<tree> dr_chain,
4755                           unsigned int length,
4756                           gimple *stmt,
4757                           gimple_stmt_iterator *gsi,
4758                           vec<tree> *result_chain)
4759 {
4760   tree vect1, vect2, high, low;
4761   gimple *perm_stmt;
4762   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4763   tree perm_mask_low, perm_mask_high;
4764   tree data_ref;
4765   tree perm3_mask_low, perm3_mask_high;
4766   unsigned int i, n, log_length = exact_log2 (length);
4767   unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4768
4769   result_chain->quick_grow (length);
4770   memcpy (result_chain->address (), dr_chain.address (),
4771           length * sizeof (tree));
4772
4773   if (length == 3)
4774     {
4775       unsigned int j0 = 0, j1 = 0, j2 = 0;
4776
4777       vec_perm_builder sel (nelt, nelt, 1);
4778       sel.quick_grow (nelt);
4779       vec_perm_indices indices;
4780       for (j = 0; j < 3; j++)
4781         {
4782           int nelt0 = ((3 - j) * nelt) % 3;
4783           int nelt1 = ((3 - j) * nelt + 1) % 3;
4784           int nelt2 = ((3 - j) * nelt + 2) % 3;
4785
4786           for (i = 0; i < nelt; i++)
4787             {
4788               if (3 * i + nelt0 < nelt)
4789                 sel[3 * i + nelt0] = j0++;
4790               if (3 * i + nelt1 < nelt)
4791                 sel[3 * i + nelt1] = nelt + j1++;
4792               if (3 * i + nelt2 < nelt)
4793                 sel[3 * i + nelt2] = 0;
4794             }
4795           indices.new_vector (sel, 2, nelt);
4796           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
4797
4798           for (i = 0; i < nelt; i++)
4799             {
4800               if (3 * i + nelt0 < nelt)
4801                 sel[3 * i + nelt0] = 3 * i + nelt0;
4802               if (3 * i + nelt1 < nelt)
4803                 sel[3 * i + nelt1] = 3 * i + nelt1;
4804               if (3 * i + nelt2 < nelt)
4805                 sel[3 * i + nelt2] = nelt + j2++;
4806             }
4807           indices.new_vector (sel, 2, nelt);
4808           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
4809
4810           vect1 = dr_chain[0];
4811           vect2 = dr_chain[1];
4812
4813           /* Create interleaving stmt:
4814              low = VEC_PERM_EXPR <vect1, vect2,
4815                                   {j, nelt, *, j + 1, nelt + j + 1, *,
4816                                    j + 2, nelt + j + 2, *, ...}>  */
4817           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4818           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4819                                            vect2, perm3_mask_low);
4820           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4821
4822           vect1 = data_ref;
4823           vect2 = dr_chain[2];
4824           /* Create interleaving stmt:
4825              low = VEC_PERM_EXPR <vect1, vect2,
4826                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
4827                                    6, 7, nelt + j + 2, ...}>  */
4828           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4829           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4830                                            vect2, perm3_mask_high);
4831           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4832           (*result_chain)[j] = data_ref;
4833         }
4834     }
4835   else
4836     {
4837       /* If length is not equal to 3 then only power of 2 is supported.  */
4838       gcc_assert (pow2p_hwi (length));
4839
4840       /* The encoding has 2 interleaved stepped patterns.  */
4841       vec_perm_builder sel (nelt, 2, 3);
4842       sel.quick_grow (6);
4843       for (i = 0; i < 3; i++)
4844         {
4845           sel[i * 2] = i;
4846           sel[i * 2 + 1] = i + nelt;
4847         }
4848         vec_perm_indices indices (sel, 2, nelt);
4849         perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
4850
4851         for (i = 0; i < 6; i++)
4852           sel[i] += nelt / 2;
4853         indices.new_vector (sel, 2, nelt);
4854         perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
4855
4856         for (i = 0, n = log_length; i < n; i++)
4857           {
4858             for (j = 0; j < length/2; j++)
4859               {
4860                 vect1 = dr_chain[j];
4861                 vect2 = dr_chain[j+length/2];
4862
4863                 /* Create interleaving stmt:
4864                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4865                                                         ...}>  */
4866                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4867                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4868                                                  vect2, perm_mask_high);
4869                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4870                 (*result_chain)[2*j] = high;
4871
4872                 /* Create interleaving stmt:
4873                    low = VEC_PERM_EXPR <vect1, vect2,
4874                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4875                                          ...}>  */
4876                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4877                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4878                                                  vect2, perm_mask_low);
4879                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4880                 (*result_chain)[2*j+1] = low;
4881               }
4882             memcpy (dr_chain.address (), result_chain->address (),
4883                     length * sizeof (tree));
4884           }
4885     }
4886 }
4887
4888 /* Function vect_setup_realignment
4889
4890    This function is called when vectorizing an unaligned load using
4891    the dr_explicit_realign[_optimized] scheme.
4892    This function generates the following code at the loop prolog:
4893
4894       p = initial_addr;
4895    x  msq_init = *(floor(p));   # prolog load
4896       realignment_token = call target_builtin;
4897     loop:
4898    x  msq = phi (msq_init, ---)
4899
4900    The stmts marked with x are generated only for the case of
4901    dr_explicit_realign_optimized.
4902
4903    The code above sets up a new (vector) pointer, pointing to the first
4904    location accessed by STMT, and a "floor-aligned" load using that pointer.
4905    It also generates code to compute the "realignment-token" (if the relevant
4906    target hook was defined), and creates a phi-node at the loop-header bb
4907    whose arguments are the result of the prolog-load (created by this
4908    function) and the result of a load that takes place in the loop (to be
4909    created by the caller to this function).
4910
4911    For the case of dr_explicit_realign_optimized:
4912    The caller to this function uses the phi-result (msq) to create the
4913    realignment code inside the loop, and sets up the missing phi argument,
4914    as follows:
4915     loop:
4916       msq = phi (msq_init, lsq)
4917       lsq = *(floor(p'));        # load in loop
4918       result = realign_load (msq, lsq, realignment_token);
4919
4920    For the case of dr_explicit_realign:
4921     loop:
4922       msq = *(floor(p));        # load in loop
4923       p' = p + (VS-1);
4924       lsq = *(floor(p'));       # load in loop
4925       result = realign_load (msq, lsq, realignment_token);
4926
4927    Input:
4928    STMT - (scalar) load stmt to be vectorized. This load accesses
4929           a memory location that may be unaligned.
4930    BSI - place where new code is to be inserted.
4931    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4932                               is used.
4933
4934    Output:
4935    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4936                        target hook, if defined.
4937    Return value - the result of the loop-header phi node.  */
4938
4939 tree
4940 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4941                         tree *realignment_token,
4942                         enum dr_alignment_support alignment_support_scheme,
4943                         tree init_addr,
4944                         struct loop **at_loop)
4945 {
4946   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4947   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4948   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4949   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4950   struct loop *loop = NULL;
4951   edge pe = NULL;
4952   tree scalar_dest = gimple_assign_lhs (stmt);
4953   tree vec_dest;
4954   gimple *inc;
4955   tree ptr;
4956   tree data_ref;
4957   basic_block new_bb;
4958   tree msq_init = NULL_TREE;
4959   tree new_temp;
4960   gphi *phi_stmt;
4961   tree msq = NULL_TREE;
4962   gimple_seq stmts = NULL;
4963   bool inv_p;
4964   bool compute_in_loop = false;
4965   bool nested_in_vect_loop = false;
4966   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4967   struct loop *loop_for_initial_load = NULL;
4968
4969   if (loop_vinfo)
4970     {
4971       loop = LOOP_VINFO_LOOP (loop_vinfo);
4972       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4973     }
4974
4975   gcc_assert (alignment_support_scheme == dr_explicit_realign
4976               || alignment_support_scheme == dr_explicit_realign_optimized);
4977
4978   /* We need to generate three things:
4979      1. the misalignment computation
4980      2. the extra vector load (for the optimized realignment scheme).
4981      3. the phi node for the two vectors from which the realignment is
4982       done (for the optimized realignment scheme).  */
4983
4984   /* 1. Determine where to generate the misalignment computation.
4985
4986      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4987      calculation will be generated by this function, outside the loop (in the
4988      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
4989      caller, inside the loop.
4990
4991      Background: If the misalignment remains fixed throughout the iterations of
4992      the loop, then both realignment schemes are applicable, and also the
4993      misalignment computation can be done outside LOOP.  This is because we are
4994      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4995      are a multiple of VS (the Vector Size), and therefore the misalignment in
4996      different vectorized LOOP iterations is always the same.
4997      The problem arises only if the memory access is in an inner-loop nested
4998      inside LOOP, which is now being vectorized using outer-loop vectorization.
4999      This is the only case when the misalignment of the memory access may not
5000      remain fixed throughout the iterations of the inner-loop (as explained in
5001      detail in vect_supportable_dr_alignment).  In this case, not only is the
5002      optimized realignment scheme not applicable, but also the misalignment
5003      computation (and generation of the realignment token that is passed to
5004      REALIGN_LOAD) have to be done inside the loop.
5005
5006      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5007      or not, which in turn determines if the misalignment is computed inside
5008      the inner-loop, or outside LOOP.  */
5009
5010   if (init_addr != NULL_TREE || !loop_vinfo)
5011     {
5012       compute_in_loop = true;
5013       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5014     }
5015
5016
5017   /* 2. Determine where to generate the extra vector load.
5018
5019      For the optimized realignment scheme, instead of generating two vector
5020      loads in each iteration, we generate a single extra vector load in the
5021      preheader of the loop, and in each iteration reuse the result of the
5022      vector load from the previous iteration.  In case the memory access is in
5023      an inner-loop nested inside LOOP, which is now being vectorized using
5024      outer-loop vectorization, we need to determine whether this initial vector
5025      load should be generated at the preheader of the inner-loop, or can be
5026      generated at the preheader of LOOP.  If the memory access has no evolution
5027      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5028      to be generated inside LOOP (in the preheader of the inner-loop).  */
5029
5030   if (nested_in_vect_loop)
5031     {
5032       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5033       bool invariant_in_outerloop =
5034             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5035       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5036     }
5037   else
5038     loop_for_initial_load = loop;
5039   if (at_loop)
5040     *at_loop = loop_for_initial_load;
5041
5042   if (loop_for_initial_load)
5043     pe = loop_preheader_edge (loop_for_initial_load);
5044
5045   /* 3. For the case of the optimized realignment, create the first vector
5046       load at the loop preheader.  */
5047
5048   if (alignment_support_scheme == dr_explicit_realign_optimized)
5049     {
5050       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5051       gassign *new_stmt;
5052
5053       gcc_assert (!compute_in_loop);
5054       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5055       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5056                                       NULL_TREE, &init_addr, NULL, &inc,
5057                                       true, &inv_p);
5058       if (TREE_CODE (ptr) == SSA_NAME)
5059         new_temp = copy_ssa_name (ptr);
5060       else
5061         new_temp = make_ssa_name (TREE_TYPE (ptr));
5062       unsigned int align = DR_TARGET_ALIGNMENT (dr);
5063       new_stmt = gimple_build_assign
5064                    (new_temp, BIT_AND_EXPR, ptr,
5065                     build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5066       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5067       gcc_assert (!new_bb);
5068       data_ref
5069         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5070                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5071       new_stmt = gimple_build_assign (vec_dest, data_ref);
5072       new_temp = make_ssa_name (vec_dest, new_stmt);
5073       gimple_assign_set_lhs (new_stmt, new_temp);
5074       if (pe)
5075         {
5076           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5077           gcc_assert (!new_bb);
5078         }
5079       else
5080          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5081
5082       msq_init = gimple_assign_lhs (new_stmt);
5083     }
5084
5085   /* 4. Create realignment token using a target builtin, if available.
5086       It is done either inside the containing loop, or before LOOP (as
5087       determined above).  */
5088
5089   if (targetm.vectorize.builtin_mask_for_load)
5090     {
5091       gcall *new_stmt;
5092       tree builtin_decl;
5093
5094       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
5095       if (!init_addr)
5096         {
5097           /* Generate the INIT_ADDR computation outside LOOP.  */
5098           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5099                                                             NULL_TREE);
5100           if (loop)
5101             {
5102               pe = loop_preheader_edge (loop);
5103               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5104               gcc_assert (!new_bb);
5105             }
5106           else
5107              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5108         }
5109
5110       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5111       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5112       vec_dest =
5113         vect_create_destination_var (scalar_dest,
5114                                      gimple_call_return_type (new_stmt));
5115       new_temp = make_ssa_name (vec_dest, new_stmt);
5116       gimple_call_set_lhs (new_stmt, new_temp);
5117
5118       if (compute_in_loop)
5119         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5120       else
5121         {
5122           /* Generate the misalignment computation outside LOOP.  */
5123           pe = loop_preheader_edge (loop);
5124           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5125           gcc_assert (!new_bb);
5126         }
5127
5128       *realignment_token = gimple_call_lhs (new_stmt);
5129
5130       /* The result of the CALL_EXPR to this builtin is determined from
5131          the value of the parameter and no global variables are touched
5132          which makes the builtin a "const" function.  Requiring the
5133          builtin to have the "const" attribute makes it unnecessary
5134          to call mark_call_clobbered.  */
5135       gcc_assert (TREE_READONLY (builtin_decl));
5136     }
5137
5138   if (alignment_support_scheme == dr_explicit_realign)
5139     return msq;
5140
5141   gcc_assert (!compute_in_loop);
5142   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5143
5144
5145   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5146
5147   pe = loop_preheader_edge (containing_loop);
5148   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5149   msq = make_ssa_name (vec_dest);
5150   phi_stmt = create_phi_node (msq, containing_loop->header);
5151   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5152
5153   return msq;
5154 }
5155
5156
5157 /* Function vect_grouped_load_supported.
5158
5159    COUNT is the size of the load group (the number of statements plus the
5160    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5161    only one statement, with a gap of COUNT - 1.
5162
5163    Returns true if a suitable permute exists.  */
5164
5165 bool
5166 vect_grouped_load_supported (tree vectype, bool single_element_p,
5167                              unsigned HOST_WIDE_INT count)
5168 {
5169   machine_mode mode = TYPE_MODE (vectype);
5170
5171   /* If this is single-element interleaving with an element distance
5172      that leaves unused vector loads around punt - we at least create
5173      very sub-optimal code in that case (and blow up memory,
5174      see PR65518).  */
5175   if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5176     {
5177       if (dump_enabled_p ())
5178         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5179                          "single-element interleaving not supported "
5180                          "for not adjacent vector loads\n");
5181       return false;
5182     }
5183
5184   /* vect_permute_load_chain requires the group size to be equal to 3 or
5185      be a power of two.  */
5186   if (count != 3 && exact_log2 (count) == -1)
5187     {
5188       if (dump_enabled_p ())
5189         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5190                          "the size of the group of accesses"
5191                          " is not a power of 2 or not equal to 3\n");
5192       return false;
5193     }
5194
5195   /* Check that the permutation is supported.  */
5196   if (VECTOR_MODE_P (mode))
5197     {
5198       unsigned int i, j;
5199       if (count == 3)
5200         {
5201           unsigned int nelt;
5202           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5203             {
5204               if (dump_enabled_p ())
5205                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5206                                  "cannot handle groups of 3 loads for"
5207                                  " variable-length vectors\n");
5208               return false;
5209             }
5210
5211           vec_perm_builder sel (nelt, nelt, 1);
5212           sel.quick_grow (nelt);
5213           vec_perm_indices indices;
5214           unsigned int k;
5215           for (k = 0; k < 3; k++)
5216             {
5217               for (i = 0; i < nelt; i++)
5218                 if (3 * i + k < 2 * nelt)
5219                   sel[i] = 3 * i + k;
5220                 else
5221                   sel[i] = 0;
5222               indices.new_vector (sel, 2, nelt);
5223               if (!can_vec_perm_const_p (mode, indices))
5224                 {
5225                   if (dump_enabled_p ())
5226                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5227                                      "shuffle of 3 loads is not supported by"
5228                                      " target\n");
5229                   return false;
5230                 }
5231               for (i = 0, j = 0; i < nelt; i++)
5232                 if (3 * i + k < 2 * nelt)
5233                   sel[i] = i;
5234                 else
5235                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5236               indices.new_vector (sel, 2, nelt);
5237               if (!can_vec_perm_const_p (mode, indices))
5238                 {
5239                   if (dump_enabled_p ())
5240                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5241                                      "shuffle of 3 loads is not supported by"
5242                                      " target\n");
5243                   return false;
5244                 }
5245             }
5246           return true;
5247         }
5248       else
5249         {
5250           /* If length is not equal to 3 then only power of 2 is supported.  */
5251           gcc_assert (pow2p_hwi (count));
5252           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5253
5254           /* The encoding has a single stepped pattern.  */
5255           vec_perm_builder sel (nelt, 1, 3);
5256           sel.quick_grow (3);
5257           for (i = 0; i < 3; i++)
5258             sel[i] = i * 2;
5259           vec_perm_indices indices (sel, 2, nelt);
5260           if (can_vec_perm_const_p (mode, indices))
5261             {
5262               for (i = 0; i < 3; i++)
5263                 sel[i] = i * 2 + 1;
5264               indices.new_vector (sel, 2, nelt);
5265               if (can_vec_perm_const_p (mode, indices))
5266                 return true;
5267             }
5268         }
5269     }
5270
5271   if (dump_enabled_p ())
5272     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5273                      "extract even/odd not supported by target\n");
5274   return false;
5275 }
5276
5277 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5278    type VECTYPE.  */
5279
5280 bool
5281 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5282 {
5283   return vect_lanes_optab_supported_p ("vec_load_lanes",
5284                                        vec_load_lanes_optab,
5285                                        vectype, count);
5286 }
5287
5288 /* Function vect_permute_load_chain.
5289
5290    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5291    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5292    the input data correctly.  Return the final references for loads in
5293    RESULT_CHAIN.
5294
5295    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5296    The input is 4 vectors each containing 8 elements. We assign a number to each
5297    element, the input sequence is:
5298
5299    1st vec:   0  1  2  3  4  5  6  7
5300    2nd vec:   8  9 10 11 12 13 14 15
5301    3rd vec:  16 17 18 19 20 21 22 23
5302    4th vec:  24 25 26 27 28 29 30 31
5303
5304    The output sequence should be:
5305
5306    1st vec:  0 4  8 12 16 20 24 28
5307    2nd vec:  1 5  9 13 17 21 25 29
5308    3rd vec:  2 6 10 14 18 22 26 30
5309    4th vec:  3 7 11 15 19 23 27 31
5310
5311    i.e., the first output vector should contain the first elements of each
5312    interleaving group, etc.
5313
5314    We use extract_even/odd instructions to create such output.  The input of
5315    each extract_even/odd operation is two vectors
5316    1st vec    2nd vec
5317    0 1 2 3    4 5 6 7
5318
5319    and the output is the vector of extracted even/odd elements.  The output of
5320    extract_even will be:   0 2 4 6
5321    and of extract_odd:     1 3 5 7
5322
5323
5324    The permutation is done in log LENGTH stages.  In each stage extract_even
5325    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5326    their order.  In our example,
5327
5328    E1: extract_even (1st vec, 2nd vec)
5329    E2: extract_odd (1st vec, 2nd vec)
5330    E3: extract_even (3rd vec, 4th vec)
5331    E4: extract_odd (3rd vec, 4th vec)
5332
5333    The output for the first stage will be:
5334
5335    E1:  0  2  4  6  8 10 12 14
5336    E2:  1  3  5  7  9 11 13 15
5337    E3: 16 18 20 22 24 26 28 30
5338    E4: 17 19 21 23 25 27 29 31
5339
5340    In order to proceed and create the correct sequence for the next stage (or
5341    for the correct output, if the second stage is the last one, as in our
5342    example), we first put the output of extract_even operation and then the
5343    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5344    The input for the second stage is:
5345
5346    1st vec (E1):  0  2  4  6  8 10 12 14
5347    2nd vec (E3): 16 18 20 22 24 26 28 30
5348    3rd vec (E2):  1  3  5  7  9 11 13 15
5349    4th vec (E4): 17 19 21 23 25 27 29 31
5350
5351    The output of the second stage:
5352
5353    E1: 0 4  8 12 16 20 24 28
5354    E2: 2 6 10 14 18 22 26 30
5355    E3: 1 5  9 13 17 21 25 29
5356    E4: 3 7 11 15 19 23 27 31
5357
5358    And RESULT_CHAIN after reordering:
5359
5360    1st vec (E1):  0 4  8 12 16 20 24 28
5361    2nd vec (E3):  1 5  9 13 17 21 25 29
5362    3rd vec (E2):  2 6 10 14 18 22 26 30
5363    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5364
5365 static void
5366 vect_permute_load_chain (vec<tree> dr_chain,
5367                          unsigned int length,
5368                          gimple *stmt,
5369                          gimple_stmt_iterator *gsi,
5370                          vec<tree> *result_chain)
5371 {
5372   tree data_ref, first_vect, second_vect;
5373   tree perm_mask_even, perm_mask_odd;
5374   tree perm3_mask_low, perm3_mask_high;
5375   gimple *perm_stmt;
5376   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5377   unsigned int i, j, log_length = exact_log2 (length);
5378   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5379
5380   result_chain->quick_grow (length);
5381   memcpy (result_chain->address (), dr_chain.address (),
5382           length * sizeof (tree));
5383
5384   if (length == 3)
5385     {
5386       unsigned int k;
5387
5388       vec_perm_builder sel (nelt, nelt, 1);
5389       sel.quick_grow (nelt);
5390       vec_perm_indices indices;
5391       for (k = 0; k < 3; k++)
5392         {
5393           for (i = 0; i < nelt; i++)
5394             if (3 * i + k < 2 * nelt)
5395               sel[i] = 3 * i + k;
5396             else
5397               sel[i] = 0;
5398           indices.new_vector (sel, 2, nelt);
5399           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5400
5401           for (i = 0, j = 0; i < nelt; i++)
5402             if (3 * i + k < 2 * nelt)
5403               sel[i] = i;
5404             else
5405               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5406           indices.new_vector (sel, 2, nelt);
5407           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5408
5409           first_vect = dr_chain[0];
5410           second_vect = dr_chain[1];
5411
5412           /* Create interleaving stmt (low part of):
5413              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5414                                                              ...}>  */
5415           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5416           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5417                                            second_vect, perm3_mask_low);
5418           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5419
5420           /* Create interleaving stmt (high part of):
5421              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5422                                                               ...}>  */
5423           first_vect = data_ref;
5424           second_vect = dr_chain[2];
5425           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5426           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5427                                            second_vect, perm3_mask_high);
5428           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5429           (*result_chain)[k] = data_ref;
5430         }
5431     }
5432   else
5433     {
5434       /* If length is not equal to 3 then only power of 2 is supported.  */
5435       gcc_assert (pow2p_hwi (length));
5436
5437       /* The encoding has a single stepped pattern.  */
5438       vec_perm_builder sel (nelt, 1, 3);
5439       sel.quick_grow (3);
5440       for (i = 0; i < 3; ++i)
5441         sel[i] = i * 2;
5442       vec_perm_indices indices (sel, 2, nelt);
5443       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5444
5445       for (i = 0; i < 3; ++i)
5446         sel[i] = i * 2 + 1;
5447       indices.new_vector (sel, 2, nelt);
5448       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5449
5450       for (i = 0; i < log_length; i++)
5451         {
5452           for (j = 0; j < length; j += 2)
5453             {
5454               first_vect = dr_chain[j];
5455               second_vect = dr_chain[j+1];
5456
5457               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5458               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5459               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5460                                                first_vect, second_vect,
5461                                                perm_mask_even);
5462               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5463               (*result_chain)[j/2] = data_ref;
5464
5465               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5466               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5467               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5468                                                first_vect, second_vect,
5469                                                perm_mask_odd);
5470               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5471               (*result_chain)[j/2+length/2] = data_ref;
5472             }
5473           memcpy (dr_chain.address (), result_chain->address (),
5474                   length * sizeof (tree));
5475         }
5476     }
5477 }
5478
5479 /* Function vect_shift_permute_load_chain.
5480
5481    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5482    sequence of stmts to reorder the input data accordingly.
5483    Return the final references for loads in RESULT_CHAIN.
5484    Return true if successed, false otherwise.
5485
5486    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5487    The input is 3 vectors each containing 8 elements.  We assign a
5488    number to each element, the input sequence is:
5489
5490    1st vec:   0  1  2  3  4  5  6  7
5491    2nd vec:   8  9 10 11 12 13 14 15
5492    3rd vec:  16 17 18 19 20 21 22 23
5493
5494    The output sequence should be:
5495
5496    1st vec:  0 3 6  9 12 15 18 21
5497    2nd vec:  1 4 7 10 13 16 19 22
5498    3rd vec:  2 5 8 11 14 17 20 23
5499
5500    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5501
5502    First we shuffle all 3 vectors to get correct elements order:
5503
5504    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5505    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5506    3rd vec:  (16 19 22) (17 20 23) (18 21)
5507
5508    Next we unite and shift vector 3 times:
5509
5510    1st step:
5511      shift right by 6 the concatenation of:
5512      "1st vec" and  "2nd vec"
5513        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5514      "2nd vec" and  "3rd vec"
5515        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5516      "3rd vec" and  "1st vec"
5517        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5518                              | New vectors                   |
5519
5520      So that now new vectors are:
5521
5522      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5523      2nd vec:  (10 13) (16 19 22) (17 20 23)
5524      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5525
5526    2nd step:
5527      shift right by 5 the concatenation of:
5528      "1st vec" and  "3rd vec"
5529        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5530      "2nd vec" and  "1st vec"
5531        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5532      "3rd vec" and  "2nd vec"
5533        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5534                           | New vectors                   |
5535
5536      So that now new vectors are:
5537
5538      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5539      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
5540      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
5541
5542    3rd step:
5543      shift right by 5 the concatenation of:
5544      "1st vec" and  "1st vec"
5545        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
5546      shift right by 3 the concatenation of:
5547      "2nd vec" and  "2nd vec"
5548                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
5549                           | New vectors                   |
5550
5551      So that now all vectors are READY:
5552      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
5553      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
5554      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
5555
5556    This algorithm is faster than one in vect_permute_load_chain if:
5557      1.  "shift of a concatination" is faster than general permutation.
5558          This is usually so.
5559      2.  The TARGET machine can't execute vector instructions in parallel.
5560          This is because each step of the algorithm depends on previous.
5561          The algorithm in vect_permute_load_chain is much more parallel.
5562
5563    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5564 */
5565
5566 static bool
5567 vect_shift_permute_load_chain (vec<tree> dr_chain,
5568                                unsigned int length,
5569                                gimple *stmt,
5570                                gimple_stmt_iterator *gsi,
5571                                vec<tree> *result_chain)
5572 {
5573   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5574   tree perm2_mask1, perm2_mask2, perm3_mask;
5575   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5576   gimple *perm_stmt;
5577
5578   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5579   unsigned int i;
5580   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5581   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5582   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5583
5584   unsigned HOST_WIDE_INT vf;
5585   if (!LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
5586     /* Not supported for variable-length vectors.  */
5587     return false;
5588
5589   vec_perm_builder sel (nelt, nelt, 1);
5590   sel.quick_grow (nelt);
5591
5592   result_chain->quick_grow (length);
5593   memcpy (result_chain->address (), dr_chain.address (),
5594           length * sizeof (tree));
5595
5596   if (pow2p_hwi (length) && vf > 4)
5597     {
5598       unsigned int j, log_length = exact_log2 (length);
5599       for (i = 0; i < nelt / 2; ++i)
5600         sel[i] = i * 2;
5601       for (i = 0; i < nelt / 2; ++i)
5602         sel[nelt / 2 + i] = i * 2 + 1;
5603       vec_perm_indices indices (sel, 2, nelt);
5604       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5605         {
5606           if (dump_enabled_p ())
5607             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5608                              "shuffle of 2 fields structure is not \
5609                               supported by target\n");
5610           return false;
5611         }
5612       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
5613
5614       for (i = 0; i < nelt / 2; ++i)
5615         sel[i] = i * 2 + 1;
5616       for (i = 0; i < nelt / 2; ++i)
5617         sel[nelt / 2 + i] = i * 2;
5618       indices.new_vector (sel, 2, nelt);
5619       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5620         {
5621           if (dump_enabled_p ())
5622             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5623                              "shuffle of 2 fields structure is not \
5624                               supported by target\n");
5625           return false;
5626         }
5627       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
5628
5629       /* Generating permutation constant to shift all elements.
5630          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
5631       for (i = 0; i < nelt; i++)
5632         sel[i] = nelt / 2 + i;
5633       indices.new_vector (sel, 2, nelt);
5634       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5635         {
5636           if (dump_enabled_p ())
5637             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5638                              "shift permutation is not supported by target\n");
5639           return false;
5640         }
5641       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5642
5643       /* Generating permutation constant to select vector from 2.
5644          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
5645       for (i = 0; i < nelt / 2; i++)
5646         sel[i] = i;
5647       for (i = nelt / 2; i < nelt; i++)
5648         sel[i] = nelt + i;
5649       indices.new_vector (sel, 2, nelt);
5650       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5651         {
5652           if (dump_enabled_p ())
5653             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5654                              "select is not supported by target\n");
5655           return false;
5656         }
5657       select_mask = vect_gen_perm_mask_checked (vectype, indices);
5658
5659       for (i = 0; i < log_length; i++)
5660         {
5661           for (j = 0; j < length; j += 2)
5662             {
5663               first_vect = dr_chain[j];
5664               second_vect = dr_chain[j + 1];
5665
5666               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5667               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5668                                                first_vect, first_vect,
5669                                                perm2_mask1);
5670               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5671               vect[0] = data_ref;
5672
5673               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5674               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5675                                                second_vect, second_vect,
5676                                                perm2_mask2);
5677               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5678               vect[1] = data_ref;
5679
5680               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5681               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5682                                                vect[0], vect[1], shift1_mask);
5683               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5684               (*result_chain)[j/2 + length/2] = data_ref;
5685
5686               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5687               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5688                                                vect[0], vect[1], select_mask);
5689               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5690               (*result_chain)[j/2] = data_ref;
5691             }
5692           memcpy (dr_chain.address (), result_chain->address (),
5693                   length * sizeof (tree));
5694         }
5695       return true;
5696     }
5697   if (length == 3 && vf > 2)
5698     {
5699       unsigned int k = 0, l = 0;
5700
5701       /* Generating permutation constant to get all elements in rigth order.
5702          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
5703       for (i = 0; i < nelt; i++)
5704         {
5705           if (3 * k + (l % 3) >= nelt)
5706             {
5707               k = 0;
5708               l += (3 - (nelt % 3));
5709             }
5710           sel[i] = 3 * k + (l % 3);
5711           k++;
5712         }
5713       vec_perm_indices indices (sel, 2, nelt);
5714       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5715         {
5716           if (dump_enabled_p ())
5717             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5718                              "shuffle of 3 fields structure is not \
5719                               supported by target\n");
5720           return false;
5721         }
5722       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
5723
5724       /* Generating permutation constant to shift all elements.
5725          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
5726       for (i = 0; i < nelt; i++)
5727         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5728       indices.new_vector (sel, 2, nelt);
5729       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5730         {
5731           if (dump_enabled_p ())
5732             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5733                              "shift permutation is not supported by target\n");
5734           return false;
5735         }
5736       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5737
5738       /* Generating permutation constant to shift all elements.
5739          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5740       for (i = 0; i < nelt; i++)
5741         sel[i] = 2 * (nelt / 3) + 1 + i;
5742       indices.new_vector (sel, 2, nelt);
5743       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5744         {
5745           if (dump_enabled_p ())
5746             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5747                              "shift permutation is not supported by target\n");
5748           return false;
5749         }
5750       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
5751
5752       /* Generating permutation constant to shift all elements.
5753          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
5754       for (i = 0; i < nelt; i++)
5755         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5756       indices.new_vector (sel, 2, nelt);
5757       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5758         {
5759           if (dump_enabled_p ())
5760             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5761                              "shift permutation is not supported by target\n");
5762           return false;
5763         }
5764       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
5765
5766       /* Generating permutation constant to shift all elements.
5767          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5768       for (i = 0; i < nelt; i++)
5769         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5770       indices.new_vector (sel, 2, nelt);
5771       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5772         {
5773           if (dump_enabled_p ())
5774             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5775                              "shift permutation is not supported by target\n");
5776           return false;
5777         }
5778       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
5779
5780       for (k = 0; k < 3; k++)
5781         {
5782           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5783           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5784                                            dr_chain[k], dr_chain[k],
5785                                            perm3_mask);
5786           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5787           vect[k] = data_ref;
5788         }
5789
5790       for (k = 0; k < 3; k++)
5791         {
5792           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5793           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5794                                            vect[k % 3], vect[(k + 1) % 3],
5795                                            shift1_mask);
5796           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5797           vect_shift[k] = data_ref;
5798         }
5799
5800       for (k = 0; k < 3; k++)
5801         {
5802           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5803           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5804                                            vect_shift[(4 - k) % 3],
5805                                            vect_shift[(3 - k) % 3],
5806                                            shift2_mask);
5807           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5808           vect[k] = data_ref;
5809         }
5810
5811       (*result_chain)[3 - (nelt % 3)] = vect[2];
5812
5813       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5814       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5815                                        vect[0], shift3_mask);
5816       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5817       (*result_chain)[nelt % 3] = data_ref;
5818
5819       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5820       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5821                                        vect[1], shift4_mask);
5822       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5823       (*result_chain)[0] = data_ref;
5824       return true;
5825     }
5826   return false;
5827 }
5828
5829 /* Function vect_transform_grouped_load.
5830
5831    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5832    to perform their permutation and ascribe the result vectorized statements to
5833    the scalar statements.
5834 */
5835
5836 void
5837 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5838                              gimple_stmt_iterator *gsi)
5839 {
5840   machine_mode mode;
5841   vec<tree> result_chain = vNULL;
5842
5843   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5844      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5845      vectors, that are ready for vector computation.  */
5846   result_chain.create (size);
5847
5848   /* If reassociation width for vector type is 2 or greater target machine can
5849      execute 2 or more vector instructions in parallel.  Otherwise try to
5850      get chain for loads group using vect_shift_permute_load_chain.  */
5851   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5852   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5853       || pow2p_hwi (size)
5854       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5855                                          gsi, &result_chain))
5856     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5857   vect_record_grouped_load_vectors (stmt, result_chain);
5858   result_chain.release ();
5859 }
5860
5861 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5862    generated as part of the vectorization of STMT.  Assign the statement
5863    for each vector to the associated scalar statement.  */
5864
5865 void
5866 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5867 {
5868   gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5869   gimple *next_stmt, *new_stmt;
5870   unsigned int i, gap_count;
5871   tree tmp_data_ref;
5872
5873   /* Put a permuted data-ref in the VECTORIZED_STMT field.
5874      Since we scan the chain starting from it's first node, their order
5875      corresponds the order of data-refs in RESULT_CHAIN.  */
5876   next_stmt = first_stmt;
5877   gap_count = 1;
5878   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5879     {
5880       if (!next_stmt)
5881         break;
5882
5883       /* Skip the gaps.  Loads created for the gaps will be removed by dead
5884        code elimination pass later.  No need to check for the first stmt in
5885        the group, since it always exists.
5886        GROUP_GAP is the number of steps in elements from the previous
5887        access (if there is no gap GROUP_GAP is 1).  We skip loads that
5888        correspond to the gaps.  */
5889       if (next_stmt != first_stmt
5890           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5891       {
5892         gap_count++;
5893         continue;
5894       }
5895
5896       while (next_stmt)
5897         {
5898           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5899           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5900              copies, and we put the new vector statement in the first available
5901              RELATED_STMT.  */
5902           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5903             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5904           else
5905             {
5906               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5907                 {
5908                   gimple *prev_stmt =
5909                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5910                   gimple *rel_stmt =
5911                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5912                   while (rel_stmt)
5913                     {
5914                       prev_stmt = rel_stmt;
5915                       rel_stmt =
5916                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5917                     }
5918
5919                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5920                     new_stmt;
5921                 }
5922             }
5923
5924           next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5925           gap_count = 1;
5926           /* If NEXT_STMT accesses the same DR as the previous statement,
5927              put the same TMP_DATA_REF as its vectorized statement; otherwise
5928              get the next data-ref from RESULT_CHAIN.  */
5929           if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5930             break;
5931         }
5932     }
5933 }
5934
5935 /* Function vect_force_dr_alignment_p.
5936
5937    Returns whether the alignment of a DECL can be forced to be aligned
5938    on ALIGNMENT bit boundary.  */
5939
5940 bool
5941 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5942 {
5943   if (!VAR_P (decl))
5944     return false;
5945
5946   if (decl_in_symtab_p (decl)
5947       && !symtab_node::get (decl)->can_increase_alignment_p ())
5948     return false;
5949
5950   if (TREE_STATIC (decl))
5951     return (alignment <= MAX_OFILE_ALIGNMENT);
5952   else
5953     return (alignment <= MAX_STACK_ALIGNMENT);
5954 }
5955
5956
5957 /* Return whether the data reference DR is supported with respect to its
5958    alignment.
5959    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5960    it is aligned, i.e., check if it is possible to vectorize it with different
5961    alignment.  */
5962
5963 enum dr_alignment_support
5964 vect_supportable_dr_alignment (struct data_reference *dr,
5965                                bool check_aligned_accesses)
5966 {
5967   gimple *stmt = DR_STMT (dr);
5968   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5969   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5970   machine_mode mode = TYPE_MODE (vectype);
5971   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5972   struct loop *vect_loop = NULL;
5973   bool nested_in_vect_loop = false;
5974
5975   if (aligned_access_p (dr) && !check_aligned_accesses)
5976     return dr_aligned;
5977
5978   /* For now assume all conditional loads/stores support unaligned
5979      access without any special code.  */
5980   if (is_gimple_call (stmt)
5981       && gimple_call_internal_p (stmt)
5982       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5983           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5984     return dr_unaligned_supported;
5985
5986   if (loop_vinfo)
5987     {
5988       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5989       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5990     }
5991
5992   /* Possibly unaligned access.  */
5993
5994   /* We can choose between using the implicit realignment scheme (generating
5995      a misaligned_move stmt) and the explicit realignment scheme (generating
5996      aligned loads with a REALIGN_LOAD).  There are two variants to the
5997      explicit realignment scheme: optimized, and unoptimized.
5998      We can optimize the realignment only if the step between consecutive
5999      vector loads is equal to the vector size.  Since the vector memory
6000      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6001      is guaranteed that the misalignment amount remains the same throughout the
6002      execution of the vectorized loop.  Therefore, we can create the
6003      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6004      at the loop preheader.
6005
6006      However, in the case of outer-loop vectorization, when vectorizing a
6007      memory access in the inner-loop nested within the LOOP that is now being
6008      vectorized, while it is guaranteed that the misalignment of the
6009      vectorized memory access will remain the same in different outer-loop
6010      iterations, it is *not* guaranteed that is will remain the same throughout
6011      the execution of the inner-loop.  This is because the inner-loop advances
6012      with the original scalar step (and not in steps of VS).  If the inner-loop
6013      step happens to be a multiple of VS, then the misalignment remains fixed
6014      and we can use the optimized realignment scheme.  For example:
6015
6016       for (i=0; i<N; i++)
6017         for (j=0; j<M; j++)
6018           s += a[i+j];
6019
6020      When vectorizing the i-loop in the above example, the step between
6021      consecutive vector loads is 1, and so the misalignment does not remain
6022      fixed across the execution of the inner-loop, and the realignment cannot
6023      be optimized (as illustrated in the following pseudo vectorized loop):
6024
6025       for (i=0; i<N; i+=4)
6026         for (j=0; j<M; j++){
6027           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6028                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6029                          // (assuming that we start from an aligned address).
6030           }
6031
6032      We therefore have to use the unoptimized realignment scheme:
6033
6034       for (i=0; i<N; i+=4)
6035           for (j=k; j<M; j+=4)
6036           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6037                            // that the misalignment of the initial address is
6038                            // 0).
6039
6040      The loop can then be vectorized as follows:
6041
6042       for (k=0; k<4; k++){
6043         rt = get_realignment_token (&vp[k]);
6044         for (i=0; i<N; i+=4){
6045           v1 = vp[i+k];
6046           for (j=k; j<M; j+=4){
6047             v2 = vp[i+j+VS-1];
6048             va = REALIGN_LOAD <v1,v2,rt>;
6049             vs += va;
6050             v1 = v2;
6051           }
6052         }
6053     } */
6054
6055   if (DR_IS_READ (dr))
6056     {
6057       bool is_packed = false;
6058       tree type = (TREE_TYPE (DR_REF (dr)));
6059
6060       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6061           && (!targetm.vectorize.builtin_mask_for_load
6062               || targetm.vectorize.builtin_mask_for_load ()))
6063         {
6064           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6065
6066           /* If we are doing SLP then the accesses need not have the
6067              same alignment, instead it depends on the SLP group size.  */
6068           if (loop_vinfo
6069               && STMT_SLP_TYPE (stmt_info)
6070               && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6071                               * GROUP_SIZE (vinfo_for_stmt
6072                                             (GROUP_FIRST_ELEMENT (stmt_info))),
6073                               TYPE_VECTOR_SUBPARTS (vectype)))
6074             ;
6075           else if (!loop_vinfo
6076                    || (nested_in_vect_loop
6077                        && (TREE_INT_CST_LOW (DR_STEP (dr))
6078                            != GET_MODE_SIZE (TYPE_MODE (vectype)))))
6079             return dr_explicit_realign;
6080           else
6081             return dr_explicit_realign_optimized;
6082         }
6083       if (!known_alignment_for_access_p (dr))
6084         is_packed = not_size_aligned (DR_REF (dr));
6085
6086       if (targetm.vectorize.support_vector_misalignment
6087             (mode, type, DR_MISALIGNMENT (dr), is_packed))
6088         /* Can't software pipeline the loads, but can at least do them.  */
6089         return dr_unaligned_supported;
6090     }
6091   else
6092     {
6093       bool is_packed = false;
6094       tree type = (TREE_TYPE (DR_REF (dr)));
6095
6096       if (!known_alignment_for_access_p (dr))
6097         is_packed = not_size_aligned (DR_REF (dr));
6098
6099      if (targetm.vectorize.support_vector_misalignment
6100            (mode, type, DR_MISALIGNMENT (dr), is_packed))
6101        return dr_unaligned_supported;
6102     }
6103
6104   /* Unsupported.  */
6105   return dr_unaligned_unsupported;
6106 }