[official-gcc.git] / gcc / tree-vect-data-refs.c
blob 070c707fdaf2a3778a3eee8abcd7d036c9c5b9cb
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
56 /* Return true if load- or store-lanes optab OPTAB is implemented for
57 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
59 static bool
60 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
61 tree vectype, unsigned HOST_WIDE_INT count)
63 machine_mode mode;
64 scalar_int_mode array_mode;
65 bool limit_p;
67 mode = TYPE_MODE (vectype);
68 limit_p = !targetm.array_mode_supported_p (mode, count);
69 if (!int_mode_for_size (count * GET_MODE_BITSIZE (mode),
70 limit_p).exists (&array_mode))
72 if (dump_enabled_p ())
73 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
74 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
75 GET_MODE_NAME (mode), count);
76 return false;
79 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
81 if (dump_enabled_p ())
82 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
83 "cannot use %s<%s><%s>\n", name,
84 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
85 return false;
88 if (dump_enabled_p ())
89 dump_printf_loc (MSG_NOTE, vect_location,
90 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
91 GET_MODE_NAME (mode));
93 return true;
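/* As an illustration (hypothetical numbers, not specific to one target):
   for a group of COUNT = 3 interleaved V16QImode accesses, the array mode
   needs 3 * 128 = 384 bits, and the check above asks whether the target
   provides a vec_load_lanes / vec_store_lanes pattern for that array
   mode -- e.g. the LD3/ST3-style structure loads and stores that some
   SIMD ISAs offer.  If either the array mode or the optab handler is
   missing, grouped accesses fall back to separate loads plus permutes.  */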
97 /* Return the smallest scalar part of STMT.
98 This is used to determine the vectype of the stmt. We generally set the
99 vectype according to the type of the result (lhs). For stmts whose
100 result-type is different than the type of the arguments (e.g., demotion,
101 promotion), vectype will be reset appropriately (later). Note that we have
102 to visit the smallest datatype in this function, because that determines the
103 VF. If the smallest datatype in the loop is present only as the rhs of a
104 promotion operation - we'd miss it.
105 Such a case, where a variable of this datatype does not appear in the lhs
106 anywhere in the loop, can only occur if it's an invariant: e.g.:
107 'int_x = (int) short_inv', which we'd expect to have been optimized away by
108 invariant motion. However, we cannot rely on invariant motion to always
109 take invariants out of the loop, and so in the case of promotion we also
110 have to check the rhs.
111 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
112 types. */
114 tree
115 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
116 HOST_WIDE_INT *rhs_size_unit)
118 tree scalar_type = gimple_expr_type (stmt);
119 HOST_WIDE_INT lhs, rhs;
121 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
123 if (is_gimple_assign (stmt)
124 && (gimple_assign_cast_p (stmt)
125 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
126 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
127 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
129 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
131 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
132 if (rhs < lhs)
133 scalar_type = rhs_type;
136 *lhs_size_unit = lhs;
137 *rhs_size_unit = rhs;
138 return scalar_type;
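/* Illustrative example (hypothetical loop, not from any testcase):

     void
     widen (int *restrict out, short *restrict in, int n)
     {
       for (int i = 0; i < n; i++)
         out[i] = (int) in[i];
     }

   Every statement produces an 'int', but a full vector of 'short'
   elements is consumed per vector iteration, so the 16-bit rhs type is
   the one that must determine the vectorization factor.  For the
   conversion statement this function therefore returns the smaller
   rhs type.  */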
142 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
143 tested at run-time. Return TRUE if DDR was successfully inserted.
144 Return false if versioning is not supported. */
146 static bool
147 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
149 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
151 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
152 return false;
154 if (!runtime_alias_check_p (ddr, loop,
155 optimize_loop_nest_for_speed_p (loop)))
156 return false;
158 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
159 return true;
163 /* A subroutine of vect_analyze_data_ref_dependence. Handle
164 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
165 distances. These distances are conservatively correct but they don't
166 reflect a guaranteed dependence.
168 Return true if this function does all the work necessary to avoid
169 an alias or false if the caller should use the dependence distances
170 to limit the vectorization factor in the usual way. LOOP_DEPTH is
171 the depth of the loop described by LOOP_VINFO and the other arguments
172 are as for vect_analyze_data_ref_dependence. */
174 static bool
175 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
176 loop_vec_info loop_vinfo,
177 int loop_depth, int *max_vf)
179 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
180 lambda_vector dist_v;
181 unsigned int i;
182 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
184 int dist = dist_v[loop_depth];
185 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
187 /* If the user asserted safelen >= DIST consecutive iterations
188 can be executed concurrently, assume independence.
190 ??? An alternative would be to add the alias check even
191 in this case, and vectorize the fallback loop with the
192 maximum VF set to safelen. However, if the user has
193 explicitly given a length, it's less likely that that
194 would be a win. */
195 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
197 if (loop->safelen < *max_vf)
198 *max_vf = loop->safelen;
199 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
200 continue;
203 /* For dependence distances of 2 or more, we have the option
204 of limiting VF or checking for an alias at runtime.
205 Prefer to check at runtime if we can, to avoid limiting
206 the VF unnecessarily when the bases are in fact independent.
208 Note that the alias checks will be removed if the VF ends up
209 being small enough. */
210 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
213 return true;
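/* The safelen test above corresponds to user annotations such as
   (illustrative):

     #pragma omp simd safelen(8)
     for (int i = 0; i < n; i++)
       a[idx[i]] += b[i];

   which assert that up to 8 consecutive iterations may be executed
   concurrently.  Distances within that bound are then treated as
   independent and *MAX_VF is merely capped at the given safelen.  */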
217 /* Function vect_analyze_data_ref_dependence.
219 Return TRUE if there (might) exist a dependence between a memory-reference
220 DRA and a memory-reference DRB.  Return FALSE when the dependence can
221 instead be checked at run time by versioning for alias.  Adjust *MAX_VF
222 according to the data dependence. */
224 static bool
225 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
226 loop_vec_info loop_vinfo, int *max_vf)
228 unsigned int i;
229 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
230 struct data_reference *dra = DDR_A (ddr);
231 struct data_reference *drb = DDR_B (ddr);
232 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
233 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
234 lambda_vector dist_v;
235 unsigned int loop_depth;
237 /* In loop analysis all data references should be vectorizable. */
238 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
239 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
240 gcc_unreachable ();
242 /* Independent data accesses. */
243 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
244 return false;
246 if (dra == drb
247 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
248 return false;
250 /* We do not have to consider dependences between accesses that belong
251 to the same group. */
252 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
253 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
254 return false;
256 /* Even if we have an anti-dependence then, as the vectorized loop covers at
257 least two scalar iterations, there is always also a true dependence.
258 As the vectorizer does not re-order loads and stores we can ignore
259 the anti-dependence if TBAA can disambiguate both DRs similar to the
260 case with known negative distance anti-dependences (positive
261 distance anti-dependences would violate TBAA constraints). */
262 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
263 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
264 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
265 get_alias_set (DR_REF (drb))))
266 return false;
268 /* Unknown data dependence. */
269 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
271 /* If user asserted safelen consecutive iterations can be
272 executed concurrently, assume independence. */
273 if (loop->safelen >= 2)
275 if (loop->safelen < *max_vf)
276 *max_vf = loop->safelen;
277 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
278 return false;
281 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
282 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
284 if (dump_enabled_p ())
286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
287 "versioning for alias not supported for: "
288 "can't determine dependence between ");
289 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
290 DR_REF (dra));
291 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
292 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
293 DR_REF (drb));
294 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
296 return true;
299 if (dump_enabled_p ())
301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
302 "versioning for alias required: "
303 "can't determine dependence between ");
304 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
305 DR_REF (dra));
306 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
307 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
308 DR_REF (drb));
309 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
312 /* Add to list of ddrs that need to be tested at run-time. */
313 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
316 /* Known data dependence. */
317 if (DDR_NUM_DIST_VECTS (ddr) == 0)
319 /* If user asserted safelen consecutive iterations can be
320 executed concurrently, assume independence. */
321 if (loop->safelen >= 2)
323 if (loop->safelen < *max_vf)
324 *max_vf = loop->safelen;
325 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
326 return false;
329 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
330 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
332 if (dump_enabled_p ())
334 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
335 "versioning for alias not supported for: "
336 "bad dist vector for ");
337 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
338 DR_REF (dra));
339 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
340 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
341 DR_REF (drb));
342 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
344 return true;
347 if (dump_enabled_p ())
349 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
350 "versioning for alias required: "
351 "bad dist vector for ");
352 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
353 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
354 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
355 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
357 /* Add to list of ddrs that need to be tested at run-time. */
358 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
361 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
363 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
364 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
365 loop_depth, max_vf))
366 return false;
368 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
370 int dist = dist_v[loop_depth];
372 if (dump_enabled_p ())
373 dump_printf_loc (MSG_NOTE, vect_location,
374 "dependence distance = %d.\n", dist);
376 if (dist == 0)
378 if (dump_enabled_p ())
380 dump_printf_loc (MSG_NOTE, vect_location,
381 "dependence distance == 0 between ");
382 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
383 dump_printf (MSG_NOTE, " and ");
384 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
385 dump_printf (MSG_NOTE, "\n");
388 /* When we perform grouped accesses and perform implicit CSE
389 by detecting equal accesses and doing disambiguation with
390 runtime alias tests like for
391 .. = a[i];
392 .. = a[i+1];
393 a[i] = ..;
394 a[i+1] = ..;
395 *p = ..;
396 .. = a[i];
397 .. = a[i+1];
398 where we will end up loading { a[i], a[i+1] } once, make
399 sure that inserting group loads before the first load and
400 stores after the last store will do the right thing.
401 Similar for groups like
402 a[i] = ...;
403 ... = a[i];
404 a[i+1] = ...;
405 where loads from the group interleave with the store. */
406 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
407 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
409 gimple *earlier_stmt;
410 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
411 if (DR_IS_WRITE
412 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
414 if (dump_enabled_p ())
415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
416 "READ_WRITE dependence in interleaving."
417 "\n");
418 return true;
422 continue;
425 if (dist > 0 && DDR_REVERSED_P (ddr))
427 /* If DDR_REVERSED_P the order of the data-refs in DDR was
428 reversed (to make distance vector positive), and the actual
429 distance is negative. */
430 if (dump_enabled_p ())
431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
432 "dependence distance negative.\n");
433 /* Record a negative dependence distance to later limit the
434 amount of stmt copying / unrolling we can perform.
435 Only need to handle read-after-write dependence. */
436 if (DR_IS_READ (drb)
437 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
438 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
439 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
440 continue;
443 if (abs (dist) >= 2
444 && abs (dist) < *max_vf)
446 /* The dependence distance requires reduction of the maximal
447 vectorization factor. */
448 *max_vf = abs (dist);
449 if (dump_enabled_p ())
450 dump_printf_loc (MSG_NOTE, vect_location,
451 "adjusting maximal vectorization factor to %i\n",
452 *max_vf);
455 if (abs (dist) >= *max_vf)
457 /* Dependence distance does not create dependence, as far as
458 vectorization is concerned, in this case. */
459 if (dump_enabled_p ())
460 dump_printf_loc (MSG_NOTE, vect_location,
461 "dependence distance >= VF.\n");
462 continue;
465 if (dump_enabled_p ())
467 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
468 "not vectorized, possible dependence "
469 "between data-refs ");
470 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
471 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
472 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
473 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
476 return true;
479 return false;
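/* A concrete (illustrative) example of the distance handling above:

     for (int i = 0; i < n; i++)
       a[i + 2] = a[i] + 1;

   has a read-after-write dependence of distance 2.  Rather than giving
   up, *MAX_VF is reduced to 2, so vectors of two elements can still be
   used: both loads of a chunk happen before either store of that chunk,
   which respects the dependence.  */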
482 /* Function vect_analyze_data_ref_dependences.
484 Examine all the data references in the loop, and make sure there do not
485 exist any data dependences between them. Set *MAX_VF according to
486 the maximum vectorization factor the data dependences allow. */
488 bool
489 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
491 unsigned int i;
492 struct data_dependence_relation *ddr;
494 if (dump_enabled_p ())
495 dump_printf_loc (MSG_NOTE, vect_location,
496 "=== vect_analyze_data_ref_dependences ===\n");
498 LOOP_VINFO_DDRS (loop_vinfo)
499 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
500 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
501 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
502 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
503 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
504 &LOOP_VINFO_DDRS (loop_vinfo),
505 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
506 return false;
508 /* For epilogues we either have no aliases or alias versioning
509 was applied to original loop. Therefore we may just get max_vf
510 using VF of original loop. */
511 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
512 *max_vf = LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo);
513 else
514 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
515 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
516 return false;
518 return true;
522 /* Function vect_slp_analyze_data_ref_dependence.
524 Return TRUE if there (might) exist a dependence between a memory-reference
525 DRA and a memory-reference DRB for SLP vectorization.  Unlike the loop
526 variant there is no run-time alias versioning or *MAX_VF adjustment
527 here; any possible dependence makes the SLP instance unvectorizable. */
529 static bool
530 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
532 struct data_reference *dra = DDR_A (ddr);
533 struct data_reference *drb = DDR_B (ddr);
535 /* We need to check dependences of statements marked as unvectorizable
536 as well, they still can prohibit vectorization. */
538 /* Independent data accesses. */
539 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
540 return false;
542 if (dra == drb)
543 return false;
545 /* Read-read is OK. */
546 if (DR_IS_READ (dra) && DR_IS_READ (drb))
547 return false;
549 /* If dra and drb are part of the same interleaving chain consider
550 them independent. */
551 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
552 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
553 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
554 return false;
556 /* Unknown data dependence. */
557 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
559 if (dump_enabled_p ())
561 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
562 "can't determine dependence between ");
563 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
564 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
565 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
566 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
569 else if (dump_enabled_p ())
571 dump_printf_loc (MSG_NOTE, vect_location,
572 "determined dependence between ");
573 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
574 dump_printf (MSG_NOTE, " and ");
575 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
576 dump_printf (MSG_NOTE, "\n");
579 return true;
583 /* Analyze dependences involved in the transform of SLP NODE. STORES
584 contain the vector of scalar stores of this instance if we are
585 disambiguating the loads. */
587 static bool
588 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
589 vec<gimple *> stores, gimple *last_store)
591 /* This walks over all stmts involved in the SLP load/store done
592 in NODE verifying we can sink them up to the last stmt in the
593 group. */
594 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
595 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
597 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
598 if (access == last_access)
599 continue;
600 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
601 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
602 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
604 gimple *stmt = gsi_stmt (gsi);
605 if (! gimple_vuse (stmt)
606 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
607 continue;
609 /* If we couldn't record a (single) data reference for this
610 stmt we have to give up. */
611 /* ??? Here and below if dependence analysis fails we can resort
612 to the alias oracle which can handle more kinds of stmts. */
613 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
614 if (!dr_b)
615 return false;
617 bool dependent = false;
618 /* If we run into a store of this same instance (we've just
619 marked those) then delay dependence checking until we run
620 into the last store because this is where it will have
621 been sunk to (and we verify if we can do that as well). */
622 if (gimple_visited_p (stmt))
624 if (stmt != last_store)
625 continue;
626 unsigned i;
627 gimple *store;
628 FOR_EACH_VEC_ELT (stores, i, store)
630 data_reference *store_dr
631 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
632 ddr_p ddr = initialize_data_dependence_relation
633 (dr_a, store_dr, vNULL);
634 dependent = vect_slp_analyze_data_ref_dependence (ddr);
635 free_dependence_relation (ddr);
636 if (dependent)
637 break;
640 else
642 ddr_p ddr = initialize_data_dependence_relation (dr_a,
643 dr_b, vNULL);
644 dependent = vect_slp_analyze_data_ref_dependence (ddr);
645 free_dependence_relation (ddr);
647 if (dependent)
648 return false;
651 return true;
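/* For instance (illustrative basic block):

     a[0] = x0;
     t = b[2];      /- unrelated memory access between the group members -/
     a[1] = x1;

   The two stores form one SLP group and will be replaced by a single
   vector store at the position of the last scalar store.  The walk above
   therefore checks that sinking 'a[0] = x0' past 't = b[2]' is safe,
   i.e. that no dependence with the intervening statement is violated.  */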
655 /* Function vect_slp_analyze_instance_dependence.
657 Analyze the data dependences of the SLP instance INSTANCE: verify that
658 its grouped stores can be sunk to the last store of the group and that
659 its loads can be sunk to the vectorized-statement insertion point. */
661 bool
662 vect_slp_analyze_instance_dependence (slp_instance instance)
664 if (dump_enabled_p ())
665 dump_printf_loc (MSG_NOTE, vect_location,
666 "=== vect_slp_analyze_instance_dependence ===\n");
668 /* The stores of this instance are at the root of the SLP tree. */
669 slp_tree store = SLP_INSTANCE_TREE (instance);
670 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
671 store = NULL;
673 /* Verify we can sink stores to the vectorized stmt insert location. */
674 gimple *last_store = NULL;
675 if (store)
677 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
678 return false;
680 /* Mark stores in this instance and remember the last one. */
681 last_store = vect_find_last_scalar_stmt_in_slp (store);
682 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
683 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
686 bool res = true;
688 /* Verify we can sink loads to the vectorized stmt insert location,
689 special-casing stores of this instance. */
690 slp_tree load;
691 unsigned int i;
692 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
693 if (! vect_slp_analyze_node_dependences (instance, load,
694 store
695 ? SLP_TREE_SCALAR_STMTS (store)
696 : vNULL, last_store))
698 res = false;
699 break;
702 /* Unset the visited flag. */
703 if (store)
704 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
705 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
707 return res;
710 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
711 the statement that contains DRB, which is useful for recording in the
712 dump file. */
714 static void
715 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
716 innermost_loop_behavior *drb)
718 bool existed;
719 innermost_loop_behavior *&entry
720 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
721 if (!existed || entry->base_alignment < drb->base_alignment)
723 entry = drb;
724 if (dump_enabled_p ())
726 dump_printf_loc (MSG_NOTE, vect_location,
727 "recording new base alignment for ");
728 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
729 dump_printf (MSG_NOTE, "\n");
730 dump_printf_loc (MSG_NOTE, vect_location,
731 " alignment: %d\n", drb->base_alignment);
732 dump_printf_loc (MSG_NOTE, vect_location,
733 " misalignment: %d\n", drb->base_misalignment);
734 dump_printf_loc (MSG_NOTE, vect_location,
735 " based on: ");
736 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
741 /* If the region we're going to vectorize is reached, all unconditional
742 data references occur at least once. We can therefore pool the base
743 alignment guarantees from each unconditional reference. Do this by
744 going through all the data references in VINFO and checking whether
745 the containing statement makes the reference unconditionally. If so,
746 record the alignment of the base address in VINFO so that it can be
747 used for all other references with the same base. */
749 void
750 vect_record_base_alignments (vec_info *vinfo)
752 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
753 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
754 data_reference *dr;
755 unsigned int i;
756 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
757 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
759 gimple *stmt = DR_STMT (dr);
760 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
762 /* If DR is nested in the loop that is being vectorized, we can also
763 record the alignment of the base wrt the outer loop. */
764 if (loop && nested_in_vect_loop_p (loop, stmt))
766 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
767 vect_record_base_alignment
768 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
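/* For example, if a loop body contains

     ... = p->a[i];            (executed on every iteration)
     if (c[i]) ... = p->b[i];  (executed only conditionally)

   then, whenever the loop is entered, the unconditional reference is
   known to happen, so the base alignment it guarantees for 'p' can also
   be used when analyzing the conditional reference with the same base
   (an illustrative sketch of the pooling described above).  */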
773 /* Function vect_compute_data_ref_alignment
775 Compute the misalignment of the data reference DR.
777 Output:
778 1. If during the misalignment computation it is found that the data reference
779 cannot be vectorized then false is returned.
780 2. DR_MISALIGNMENT (DR) is defined.
782 FOR NOW: No analysis is actually performed. Misalignment is calculated
783 only for trivial cases. TODO. */
785 bool
786 vect_compute_data_ref_alignment (struct data_reference *dr)
788 gimple *stmt = DR_STMT (dr);
789 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
790 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
791 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
792 struct loop *loop = NULL;
793 tree ref = DR_REF (dr);
794 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
796 if (dump_enabled_p ())
797 dump_printf_loc (MSG_NOTE, vect_location,
798 "vect_compute_data_ref_alignment:\n");
800 if (loop_vinfo)
801 loop = LOOP_VINFO_LOOP (loop_vinfo);
803 /* Initialize misalignment to unknown. */
804 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
806 innermost_loop_behavior *drb = vect_dr_behavior (dr);
807 bool step_preserves_misalignment_p;
809 /* No step for BB vectorization. */
810 if (!loop)
812 gcc_assert (integer_zerop (drb->step));
813 step_preserves_misalignment_p = true;
816 /* In case the dataref is in an inner-loop of the loop that is being
817 vectorized (LOOP), we use the base and misalignment information
818 relative to the outer-loop (LOOP). This is ok only if the misalignment
819 stays the same throughout the execution of the inner-loop, which is why
820 we have to check that the stride of the dataref in the inner-loop is a
821 multiple of the vector size.  */
822 else if (nested_in_vect_loop_p (loop, stmt))
824 step_preserves_misalignment_p
825 = (DR_STEP_ALIGNMENT (dr)
826 % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;
828 if (dump_enabled_p ())
830 if (step_preserves_misalignment_p)
831 dump_printf_loc (MSG_NOTE, vect_location,
832 "inner step divides the vector-size.\n");
833 else
834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
835 "inner step doesn't divide the vector-size.\n");
839 /* Similarly we can only use base and misalignment information relative to
840 an innermost loop if the misalignment stays the same throughout the
841 execution of the loop. As above, this is the case if the stride of
842 the dataref is a multiple of the vector size.  */
843 else
845 unsigned vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
846 step_preserves_misalignment_p
847 = ((DR_STEP_ALIGNMENT (dr) * vf)
848 % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;
850 if (!step_preserves_misalignment_p && dump_enabled_p ())
851 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
852 "step doesn't divide the vector-size.\n");
855 unsigned int base_alignment = drb->base_alignment;
856 unsigned int base_misalignment = drb->base_misalignment;
857 unsigned HOST_WIDE_INT vector_alignment = TYPE_ALIGN_UNIT (vectype);
859 /* Calculate the maximum of the pooled base address alignment and the
860 alignment that we can compute for DR itself. */
861 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
862 if (entry && base_alignment < (*entry)->base_alignment)
864 base_alignment = (*entry)->base_alignment;
865 base_misalignment = (*entry)->base_misalignment;
868 if (drb->offset_alignment < vector_alignment
869 || !step_preserves_misalignment_p
870 /* We need to know whether the step wrt the vectorized loop is
871 negative when computing the starting misalignment below. */
872 || TREE_CODE (drb->step) != INTEGER_CST)
874 if (dump_enabled_p ())
876 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
877 "Unknown alignment for access: ");
878 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
879 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
881 return true;
884 if (base_alignment < vector_alignment)
886 tree base = drb->base_address;
887 if (TREE_CODE (base) == ADDR_EXPR)
888 base = TREE_OPERAND (base, 0);
889 if (!vect_can_force_dr_alignment_p (base,
890 vector_alignment * BITS_PER_UNIT))
892 if (dump_enabled_p ())
894 dump_printf_loc (MSG_NOTE, vect_location,
895 "can't force alignment of ref: ");
896 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
897 dump_printf (MSG_NOTE, "\n");
899 return true;
902 if (DECL_USER_ALIGN (base))
904 if (dump_enabled_p ())
906 dump_printf_loc (MSG_NOTE, vect_location,
907 "not forcing alignment of user-aligned "
908 "variable: ");
909 dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
910 dump_printf (MSG_NOTE, "\n");
912 return true;
915 /* Force the alignment of the decl.
916 NOTE: This is the only change to the code we make during
917 the analysis phase, before deciding to vectorize the loop. */
918 if (dump_enabled_p ())
920 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
921 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
922 dump_printf (MSG_NOTE, "\n");
925 DR_VECT_AUX (dr)->base_decl = base;
926 DR_VECT_AUX (dr)->base_misaligned = true;
927 base_misalignment = 0;
929 unsigned int misalignment = (base_misalignment
930 + TREE_INT_CST_LOW (drb->init));
932 /* If this is a backward running DR then first access in the larger
933 vectype actually is N-1 elements before the address in the DR.
934 Adjust misalign accordingly. */
935 if (tree_int_cst_sgn (drb->step) < 0)
936 /* PLUS because STEP is negative. */
937 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
938 * TREE_INT_CST_LOW (drb->step));
940 SET_DR_MISALIGNMENT (dr, misalignment & (vector_alignment - 1));
942 if (dump_enabled_p ())
944 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
945 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
946 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
947 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
950 return true;
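/* Numeric example (illustrative): for a V4SI access to a[i + 1], where
   the 'int' array 'a' is known to be 16-byte aligned, base_misalignment
   is 0 and drb->init is 4, so the computation above yields
   (0 + 4) & (16 - 1) = 4, i.e. DR_MISALIGNMENT (dr) = 4 bytes.  */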
954 /* Function vect_update_misalignment_for_peel.
955 Sets DR's misalignment
956 - to 0 if it has the same alignment as DR_PEEL,
957 - to the misalignment computed using NPEEL if DR's misalignment is known,
958 - to -1 (unknown) otherwise.
960 DR - the data reference whose misalignment is to be adjusted.
961 DR_PEEL - the data reference whose misalignment is being made
962 zero in the vector loop by the peel.
963 NPEEL - the number of iterations in the peel loop if the misalignment
964 of DR_PEEL is known at compile time. */
966 static void
967 vect_update_misalignment_for_peel (struct data_reference *dr,
968 struct data_reference *dr_peel, int npeel)
970 unsigned int i;
971 vec<dr_p> same_aligned_drs;
972 struct data_reference *current_dr;
973 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
974 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
975 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
976 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
978 /* For interleaved data accesses the step in the loop must be multiplied by
979 the size of the interleaving group. */
980 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
981 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
982 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
983 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
985 /* It can be assumed that the data refs with the same alignment as dr_peel
986 are aligned in the vector loop. */
987 same_aligned_drs
988 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
989 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
991 if (current_dr != dr)
992 continue;
993 gcc_assert (!known_alignment_for_access_p (dr)
994 || !known_alignment_for_access_p (dr_peel)
995 || (DR_MISALIGNMENT (dr) / dr_size
996 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
997 SET_DR_MISALIGNMENT (dr, 0);
998 return;
1001 if (known_alignment_for_access_p (dr)
1002 && known_alignment_for_access_p (dr_peel))
1004 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1005 int misal = DR_MISALIGNMENT (dr);
1006 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1007 misal += negative ? -npeel * dr_size : npeel * dr_size;
1008 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
1009 SET_DR_MISALIGNMENT (dr, misal);
1010 return;
1013 if (dump_enabled_p ())
1014 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1015 "to unknown (-1).\n");
1016 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
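/* Worked example (illustrative): for an 'int' access with
   DR_MISALIGNMENT 4, a 16-byte vector type and NPEEL = 3 peeled
   iterations with a positive step, the update above computes
   (4 + 3 * 4) & (16 - 1) = 0, so DR becomes aligned by the same peel.  */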
1020 /* Function verify_data_ref_alignment
1022 Return TRUE if DR can be handled with respect to alignment. */
1024 static bool
1025 verify_data_ref_alignment (data_reference_p dr)
1027 enum dr_alignment_support supportable_dr_alignment
1028 = vect_supportable_dr_alignment (dr, false);
1029 if (!supportable_dr_alignment)
1031 if (dump_enabled_p ())
1033 if (DR_IS_READ (dr))
1034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1035 "not vectorized: unsupported unaligned load.");
1036 else
1037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1038 "not vectorized: unsupported unaligned "
1039 "store.");
1041 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1042 DR_REF (dr));
1043 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1045 return false;
1048 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1049 dump_printf_loc (MSG_NOTE, vect_location,
1050 "Vectorizing an unaligned access.\n");
1052 return true;
1055 /* Function vect_verify_datarefs_alignment
1057 Return TRUE if all data references in the loop can be
1058 handled with respect to alignment. */
1060 bool
1061 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1063 vec<data_reference_p> datarefs = vinfo->datarefs;
1064 struct data_reference *dr;
1065 unsigned int i;
1067 FOR_EACH_VEC_ELT (datarefs, i, dr)
1069 gimple *stmt = DR_STMT (dr);
1070 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1072 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1073 continue;
1075 /* For interleaving, only the alignment of the first access matters. */
1076 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1077 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1078 continue;
1080 /* Strided accesses perform only component accesses, alignment is
1081 irrelevant for them. */
1082 if (STMT_VINFO_STRIDED_P (stmt_info)
1083 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1084 continue;
1086 if (! verify_data_ref_alignment (dr))
1087 return false;
1090 return true;
1093 /* Given a memory reference EXP, return whether its alignment is less
1094 than its size. */
1096 static bool
1097 not_size_aligned (tree exp)
1099 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1100 return true;
1102 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1103 > get_object_alignment (exp));
1106 /* Function vector_alignment_reachable_p
1108 Return true if vector alignment for DR is reachable by peeling
1109 a few loop iterations. Return false otherwise. */
1111 static bool
1112 vector_alignment_reachable_p (struct data_reference *dr)
1114 gimple *stmt = DR_STMT (dr);
1115 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1116 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1118 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1120 /* For interleaved access we peel only if number of iterations in
1121 the prolog loop ({VF - misalignment}), is a multiple of the
1122 number of the interleaved accesses. */
1123 int elem_size, mis_in_elements;
1124 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1126 /* FORNOW: handle only known alignment. */
1127 if (!known_alignment_for_access_p (dr))
1128 return false;
1130 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1131 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1133 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1134 return false;
1137 /* If misalignment is known at the compile time then allow peeling
1138 only if natural alignment is reachable through peeling. */
1139 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1141 HOST_WIDE_INT elmsize =
1142 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1143 if (dump_enabled_p ())
1145 dump_printf_loc (MSG_NOTE, vect_location,
1146 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1147 dump_printf (MSG_NOTE,
1148 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1150 if (DR_MISALIGNMENT (dr) % elmsize)
1152 if (dump_enabled_p ())
1153 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1154 "data size does not divide the misalignment.\n");
1155 return false;
1159 if (!known_alignment_for_access_p (dr))
1161 tree type = TREE_TYPE (DR_REF (dr));
1162 bool is_packed = not_size_aligned (DR_REF (dr));
1163 if (dump_enabled_p ())
1164 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1165 "Unknown misalignment, %snaturally aligned\n",
1166 is_packed ? "not " : "");
1167 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1170 return true;
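/* For example (illustrative), a V4SI access with DR_MISALIGNMENT 6 can
   never be brought to a 16-byte boundary by peeling: each peeled scalar
   iteration moves the address by 4 bytes, so the misalignment cycles
   through 6, 10, 14, 2, 6, ... and never reaches 0.  This is exactly
   the "data size does not divide the misalignment" rejection above.  */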
1174 /* Calculate the cost of the memory access represented by DR. */
1176 static void
1177 vect_get_data_access_cost (struct data_reference *dr,
1178 unsigned int *inside_cost,
1179 unsigned int *outside_cost,
1180 stmt_vector_for_cost *body_cost_vec)
1182 gimple *stmt = DR_STMT (dr);
1183 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1184 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1185 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1186 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1187 int ncopies = MAX (1, vf / nunits); /* TODO: Handle SLP properly */
1189 if (DR_IS_READ (dr))
1190 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1191 NULL, body_cost_vec, false);
1192 else
1193 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1195 if (dump_enabled_p ())
1196 dump_printf_loc (MSG_NOTE, vect_location,
1197 "vect_get_data_access_cost: inside_cost = %d, "
1198 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1202 typedef struct _vect_peel_info
1204 struct data_reference *dr;
1205 int npeel;
1206 unsigned int count;
1207 } *vect_peel_info;
1209 typedef struct _vect_peel_extended_info
1211 struct _vect_peel_info peel_info;
1212 unsigned int inside_cost;
1213 unsigned int outside_cost;
1214 } *vect_peel_extended_info;
1217 /* Peeling hashtable helpers. */
1219 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1221 static inline hashval_t hash (const _vect_peel_info *);
1222 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1225 inline hashval_t
1226 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1228 return (hashval_t) peel_info->npeel;
1231 inline bool
1232 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1234 return (a->npeel == b->npeel);
1238 /* Insert DR into peeling hash table with NPEEL as key. */
1240 static void
1241 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1242 loop_vec_info loop_vinfo, struct data_reference *dr,
1243 int npeel)
1245 struct _vect_peel_info elem, *slot;
1246 _vect_peel_info **new_slot;
1247 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1249 elem.npeel = npeel;
1250 slot = peeling_htab->find (&elem);
1251 if (slot)
1252 slot->count++;
1253 else
1255 slot = XNEW (struct _vect_peel_info);
1256 slot->npeel = npeel;
1257 slot->dr = dr;
1258 slot->count = 1;
1259 new_slot = peeling_htab->find_slot (slot, INSERT);
1260 *new_slot = slot;
1263 if (!supportable_dr_alignment
1264 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1265 slot->count += VECT_MAX_COST;
1269 /* Traverse peeling hash table to find peeling option that aligns maximum
1270 number of data accesses. */
1272 static int
1273 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1274 _vect_peel_extended_info *max)
1276 vect_peel_info elem = *slot;
1278 if (elem->count > max->peel_info.count
1279 || (elem->count == max->peel_info.count
1280 && max->peel_info.npeel > elem->npeel))
1282 max->peel_info.npeel = elem->npeel;
1283 max->peel_info.count = elem->count;
1284 max->peel_info.dr = elem->dr;
1287 return 1;
1290 /* Get the costs of peeling NPEEL iterations checking data access costs
1291 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1292 misalignment will be zero after peeling. */
1294 static void
1295 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1296 struct data_reference *dr0,
1297 unsigned int *inside_cost,
1298 unsigned int *outside_cost,
1299 stmt_vector_for_cost *body_cost_vec,
1300 unsigned int npeel,
1301 bool unknown_misalignment)
1303 unsigned i;
1304 data_reference *dr;
1306 FOR_EACH_VEC_ELT (datarefs, i, dr)
1308 gimple *stmt = DR_STMT (dr);
1309 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1310 /* For interleaving, only the alignment of the first access
1311 matters. */
1312 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1313 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1314 continue;
1316 /* Strided accesses perform only component accesses, alignment is
1317 irrelevant for them. */
1318 if (STMT_VINFO_STRIDED_P (stmt_info)
1319 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1320 continue;
1322 int save_misalignment;
1323 save_misalignment = DR_MISALIGNMENT (dr);
1324 if (npeel == 0)
1326 else if (unknown_misalignment && dr == dr0)
1327 SET_DR_MISALIGNMENT (dr, 0);
1328 else
1329 vect_update_misalignment_for_peel (dr, dr0, npeel);
1330 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1331 body_cost_vec);
1332 SET_DR_MISALIGNMENT (dr, save_misalignment);
1336 /* Traverse peeling hash table and calculate cost for each peeling option.
1337 Find the one with the lowest cost. */
1339 static int
1340 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1341 _vect_peel_extended_info *min)
1343 vect_peel_info elem = *slot;
1344 int dummy;
1345 unsigned int inside_cost = 0, outside_cost = 0;
1346 gimple *stmt = DR_STMT (elem->dr);
1347 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1348 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1349 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1350 epilogue_cost_vec;
1352 prologue_cost_vec.create (2);
1353 body_cost_vec.create (2);
1354 epilogue_cost_vec.create (2);
1356 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1357 elem->dr, &inside_cost, &outside_cost,
1358 &body_cost_vec, elem->npeel, false);
1360 body_cost_vec.release ();
1362 outside_cost += vect_get_known_peeling_cost
1363 (loop_vinfo, elem->npeel, &dummy,
1364 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1365 &prologue_cost_vec, &epilogue_cost_vec);
1367 /* Prologue and epilogue costs are added to the target model later.
1368 These costs depend only on the scalar iteration cost, the
1369 number of peeling iterations finally chosen, and the number of
1370 misaligned statements. So discard the information found here. */
1371 prologue_cost_vec.release ();
1372 epilogue_cost_vec.release ();
1374 if (inside_cost < min->inside_cost
1375 || (inside_cost == min->inside_cost
1376 && outside_cost < min->outside_cost))
1378 min->inside_cost = inside_cost;
1379 min->outside_cost = outside_cost;
1380 min->peel_info.dr = elem->dr;
1381 min->peel_info.npeel = elem->npeel;
1382 min->peel_info.count = elem->count;
1385 return 1;
1389 /* Choose best peeling option by traversing peeling hash table and either
1390 choosing an option with the lowest cost (if cost model is enabled) or the
1391 option that aligns as many accesses as possible. */
1393 static struct _vect_peel_extended_info
1394 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1395 loop_vec_info loop_vinfo)
1397 struct _vect_peel_extended_info res;
1399 res.peel_info.dr = NULL;
1401 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1403 res.inside_cost = INT_MAX;
1404 res.outside_cost = INT_MAX;
1405 peeling_htab->traverse <_vect_peel_extended_info *,
1406 vect_peeling_hash_get_lowest_cost> (&res);
1408 else
1410 res.peel_info.count = 0;
1411 peeling_htab->traverse <_vect_peel_extended_info *,
1412 vect_peeling_hash_get_most_frequent> (&res);
1413 res.inside_cost = 0;
1414 res.outside_cost = 0;
1417 return res;
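/* E.g. (illustrative): if the hash-table entry for npeel = 0 aligns
   three of the references while the entry for npeel = 5 aligns only two
   but has a lower inside cost, the cost-model path above picks
   npeel = 5; with -fvect-cost-model=unlimited the most frequently
   aligning entry, npeel = 0, would be chosen instead.  */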
1420 /* Return true if the new peeling NPEEL is supported. */
1422 static bool
1423 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1424 unsigned npeel)
1426 unsigned i;
1427 struct data_reference *dr = NULL;
1428 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1429 gimple *stmt;
1430 stmt_vec_info stmt_info;
1431 enum dr_alignment_support supportable_dr_alignment;
1433 /* Ensure that all data refs can be vectorized after the peel. */
1434 FOR_EACH_VEC_ELT (datarefs, i, dr)
1436 int save_misalignment;
1438 if (dr == dr0)
1439 continue;
1441 stmt = DR_STMT (dr);
1442 stmt_info = vinfo_for_stmt (stmt);
1443 /* For interleaving, only the alignment of the first access
1444 matters. */
1445 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1446 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1447 continue;
1449 /* Strided accesses perform only component accesses, alignment is
1450 irrelevant for them. */
1451 if (STMT_VINFO_STRIDED_P (stmt_info)
1452 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1453 continue;
1455 save_misalignment = DR_MISALIGNMENT (dr);
1456 vect_update_misalignment_for_peel (dr, dr0, npeel);
1457 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1458 SET_DR_MISALIGNMENT (dr, save_misalignment);
1460 if (!supportable_dr_alignment)
1461 return false;
1464 return true;
1467 /* Function vect_enhance_data_refs_alignment
1469 This pass will use loop versioning and loop peeling in order to enhance
1470 the alignment of data references in the loop.
1472 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1473 original loop is to be vectorized. Any other loops that are created by
1474 the transformations performed in this pass - are not supposed to be
1475 vectorized. This restriction will be relaxed.
1477 This pass will require a cost model to guide it whether to apply peeling
1478 or versioning or a combination of the two. For example, the scheme that
1479 Intel uses when given a loop with several memory accesses, is as follows:
1480 choose one memory access ('p') whose alignment you want to force by doing
1481 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1482 other accesses are not necessarily aligned, or (2) use loop versioning to
1483 generate one loop in which all accesses are aligned, and another loop in
1484 which only 'p' is necessarily aligned.
1486 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1487 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1488 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1490 Devising a cost model is the most critical aspect of this work. It will
1491 guide us on which access to peel for, whether to use loop versioning, how
1492 many versions to create, etc. The cost model will probably consist of
1493 generic considerations as well as target specific considerations (on
1494 powerpc for example, misaligned stores are more painful than misaligned
1495 loads).
1497 Here are the general steps involved in alignment enhancements:
1499 -- original loop, before alignment analysis:
1500 for (i=0; i<N; i++){
1501 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1502 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1505 -- After vect_compute_data_refs_alignment:
1506 for (i=0; i<N; i++){
1507 x = q[i]; # DR_MISALIGNMENT(q) = 3
1508 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1511 -- Possibility 1: we do loop versioning:
1512 if (p is aligned) {
1513 for (i=0; i<N; i++){ # loop 1A
1514 x = q[i]; # DR_MISALIGNMENT(q) = 3
1515 p[i] = y; # DR_MISALIGNMENT(p) = 0
1518 else {
1519 for (i=0; i<N; i++){ # loop 1B
1520 x = q[i]; # DR_MISALIGNMENT(q) = 3
1521 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1525 -- Possibility 2: we do loop peeling:
1526 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1527 x = q[i];
1528 p[i] = y;
1530 for (i = 3; i < N; i++){ # loop 2A
1531 x = q[i]; # DR_MISALIGNMENT(q) = 0
1532 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1535 -- Possibility 3: combination of loop peeling and versioning:
1536 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1537 x = q[i];
1538 p[i] = y;
1540 if (p is aligned) {
1541 for (i = 3; i<N; i++){ # loop 3A
1542 x = q[i]; # DR_MISALIGNMENT(q) = 0
1543 p[i] = y; # DR_MISALIGNMENT(p) = 0
1546 else {
1547 for (i = 3; i<N; i++){ # loop 3B
1548 x = q[i]; # DR_MISALIGNMENT(q) = 0
1549 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1553 These loops are later passed to loop_transform to be vectorized. The
1554 vectorizer will use the alignment information to guide the transformation
1555 (whether to generate regular loads/stores, or with special handling for
1556 misalignment). */
1558 bool
1559 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1561 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1562 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1563 enum dr_alignment_support supportable_dr_alignment;
1564 struct data_reference *dr0 = NULL, *first_store = NULL;
1565 struct data_reference *dr;
1566 unsigned int i, j;
1567 bool do_peeling = false;
1568 bool do_versioning = false;
1569 bool stat;
1570 gimple *stmt;
1571 stmt_vec_info stmt_info;
1572 unsigned int npeel = 0;
1573 bool one_misalignment_known = false;
1574 bool one_misalignment_unknown = false;
1575 bool one_dr_unsupportable = false;
1576 struct data_reference *unsupportable_dr = NULL;
1577 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1578 unsigned possible_npeel_number = 1;
1579 tree vectype;
1580 unsigned int nelements, mis, same_align_drs_max = 0;
1581 hash_table<peel_info_hasher> peeling_htab (1);
1583 if (dump_enabled_p ())
1584 dump_printf_loc (MSG_NOTE, vect_location,
1585 "=== vect_enhance_data_refs_alignment ===\n");
1587 /* Reset data so we can safely be called multiple times. */
1588 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1589 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1591 /* While cost model enhancements are expected in the future, the high level
1592 view of the code at this time is as follows:
1594 A) If there is a misaligned access then see if peeling to align
1595 this access can make all data references satisfy
1596 vect_supportable_dr_alignment. If so, update data structures
1597 as needed and return true.
1599 B) If peeling wasn't possible and there is a data reference with an
1600 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1601 then see if loop versioning checks can be used to make all data
1602 references satisfy vect_supportable_dr_alignment. If so, update
1603 data structures as needed and return true.
1605 C) If neither peeling nor versioning were successful then return false if
1606 any data reference does not satisfy vect_supportable_dr_alignment.
1608 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1610 Note, Possibility 3 above (which is peeling and versioning together) is not
1611 being done at this time. */
1613 /* (1) Peeling to force alignment. */
1615 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1616 Considerations:
1617 + How many accesses will become aligned due to the peeling
1618 - How many accesses will become unaligned due to the peeling,
1619 and the cost of misaligned accesses.
1620 - The cost of peeling (the extra runtime checks, the increase
1621 in code size). */
1623 FOR_EACH_VEC_ELT (datarefs, i, dr)
1625 stmt = DR_STMT (dr);
1626 stmt_info = vinfo_for_stmt (stmt);
1628 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1629 continue;
1631 /* For interleaving, only the alignment of the first access
1632 matters. */
1633 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1634 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1635 continue;
1637 /* For invariant accesses there is nothing to enhance. */
1638 if (integer_zerop (DR_STEP (dr)))
1639 continue;
1641 /* Strided accesses perform only component accesses, alignment is
1642 irrelevant for them. */
1643 if (STMT_VINFO_STRIDED_P (stmt_info)
1644 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1645 continue;
1647 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1648 do_peeling = vector_alignment_reachable_p (dr);
1649 if (do_peeling)
1651 if (known_alignment_for_access_p (dr))
1653 unsigned int npeel_tmp = 0;
1654 bool negative = tree_int_cst_compare (DR_STEP (dr),
1655 size_zero_node) < 0;
1657 vectype = STMT_VINFO_VECTYPE (stmt_info);
1658 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1659 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1660 TREE_TYPE (DR_REF (dr))));
1661 if (DR_MISALIGNMENT (dr) != 0)
1662 npeel_tmp = (negative ? (mis - nelements)
1663 : (nelements - mis)) & (nelements - 1);
1665 /* For multiple types, it is possible that the bigger type access
1666 will have more than one peeling option. E.g., a loop with two
1667 types: one of size (vector size / 4), and the other one of
1668 size (vector size / 8). Vectorization factor will be 8. If both
1669 accesses are misaligned by 3, the first one needs one scalar
1670 iteration to be aligned, and the second one needs 5. But the
1671 first one will be aligned also by peeling 5 scalar
1672 iterations, and in that case both accesses will be aligned.
1673 Hence, except for the immediate peeling amount, we also want
1674 to try to add full vector size, while we don't exceed
1675 vectorization factor.
1676 We do this automatically for cost model, since we calculate
1677 cost for every peeling option. */
1678 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1680 if (STMT_SLP_TYPE (stmt_info))
1681 possible_npeel_number
1682 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1683 else
1684 possible_npeel_number = vf / nelements;
1686 /* NPEEL_TMP is 0 when there is no misalignment, but also
1687 allow peeling NELEMENTS. */
1688 if (DR_MISALIGNMENT (dr) == 0)
1689 possible_npeel_number++;
1692 /* Save info about DR in the hash table. Also include peeling
1693 amounts according to the explanation above. */
1694 for (j = 0; j < possible_npeel_number; j++)
1696 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1697 dr, npeel_tmp);
1698 npeel_tmp += nelements;
1701 one_misalignment_known = true;
1703 else
1705 /* If we don't know any misalignment values, we prefer
1706 peeling for the data-ref that has the maximum number of data-refs
1707 with the same alignment, unless the target prefers to align
1708 stores over loads.  */
1709 unsigned same_align_drs
1710 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1711 if (!dr0
1712 || same_align_drs_max < same_align_drs)
1714 same_align_drs_max = same_align_drs;
1715 dr0 = dr;
1717 /* For data-refs with the same number of related
1718 accesses prefer the one where the misalign
1719 computation will be invariant in the outermost loop. */
1720 else if (same_align_drs_max == same_align_drs)
1722 struct loop *ivloop0, *ivloop;
1723 ivloop0 = outermost_invariant_loop_for_expr
1724 (loop, DR_BASE_ADDRESS (dr0));
1725 ivloop = outermost_invariant_loop_for_expr
1726 (loop, DR_BASE_ADDRESS (dr));
1727 if ((ivloop && !ivloop0)
1728 || (ivloop && ivloop0
1729 && flow_loop_nested_p (ivloop, ivloop0)))
1730 dr0 = dr;
1733 one_misalignment_unknown = true;
1735 /* Check for data refs with unsupportable alignment that
1736 can be peeled. */
1737 if (!supportable_dr_alignment)
1739 one_dr_unsupportable = true;
1740 unsupportable_dr = dr;
1743 if (!first_store && DR_IS_WRITE (dr))
1744 first_store = dr;
1747 else
1749 if (!aligned_access_p (dr))
1751 if (dump_enabled_p ())
1752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1753 "vector alignment may not be reachable\n");
1754 break;
1759 /* Check if we can possibly peel the loop. */
1760 if (!vect_can_advance_ivs_p (loop_vinfo)
1761 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1762 || loop->inner)
1763 do_peeling = false;
1765 struct _vect_peel_extended_info peel_for_known_alignment;
1766 struct _vect_peel_extended_info peel_for_unknown_alignment;
1767 struct _vect_peel_extended_info best_peel;
1769 peel_for_unknown_alignment.inside_cost = INT_MAX;
1770 peel_for_unknown_alignment.outside_cost = INT_MAX;
1771 peel_for_unknown_alignment.peel_info.count = 0;
1773 if (do_peeling
1774 && one_misalignment_unknown)
1776 /* Check whether the target prefers to align stores over loads, i.e.,
1777 whether misaligned stores are more expensive than misaligned loads
1778 (taking drs with the same alignment into account).  */
1779 unsigned int load_inside_cost = 0;
1780 unsigned int load_outside_cost = 0;
1781 unsigned int store_inside_cost = 0;
1782 unsigned int store_outside_cost = 0;
1784 stmt_vector_for_cost dummy;
1785 dummy.create (2);
1786 vect_get_peeling_costs_all_drs (datarefs, dr0,
1787 &load_inside_cost,
1788 &load_outside_cost,
1789 &dummy, vf / 2, true);
1790 dummy.release ();
1792 if (first_store)
1794 dummy.create (2);
1795 vect_get_peeling_costs_all_drs (datarefs, first_store,
1796 &store_inside_cost,
1797 &store_outside_cost,
1798 &dummy, vf / 2, true);
1799 dummy.release ();
1801 else
1803 store_inside_cost = INT_MAX;
1804 store_outside_cost = INT_MAX;
1807 if (load_inside_cost > store_inside_cost
1808 || (load_inside_cost == store_inside_cost
1809 && load_outside_cost > store_outside_cost))
1811 dr0 = first_store;
1812 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1813 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1815 else
1817 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1818 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1821 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1822 prologue_cost_vec.create (2);
1823 epilogue_cost_vec.create (2);
1825 int dummy2;
1826 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1827 (loop_vinfo, vf / 2, &dummy2,
1828 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1829 &prologue_cost_vec, &epilogue_cost_vec);
1831 prologue_cost_vec.release ();
1832 epilogue_cost_vec.release ();
1834 peel_for_unknown_alignment.peel_info.count = 1
1835 + STMT_VINFO_SAME_ALIGN_REFS
1836 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1839 peel_for_unknown_alignment.peel_info.npeel = 0;
1840 peel_for_unknown_alignment.peel_info.dr = dr0;
1842 best_peel = peel_for_unknown_alignment;
1844 peel_for_known_alignment.inside_cost = INT_MAX;
1845 peel_for_known_alignment.outside_cost = INT_MAX;
1846 peel_for_known_alignment.peel_info.count = 0;
1847 peel_for_known_alignment.peel_info.dr = NULL;
1849 if (do_peeling && one_misalignment_known)
1851 /* Peeling is possible, and no data access is unsupported when it is
1852 misaligned.  So we try to choose the best possible peeling from
1853 the hash table.  */
1854 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1855 (&peeling_htab, loop_vinfo);
1858 /* Compare costs of peeling for known and unknown alignment. */
1859 if (peel_for_known_alignment.peel_info.dr != NULL
1860 && peel_for_unknown_alignment.inside_cost
1861 >= peel_for_known_alignment.inside_cost)
1863 best_peel = peel_for_known_alignment;
1865 /* If the best peeling for known alignment has NPEEL == 0, perform no
1866 peeling at all except if there is an unsupportable dr that we can
1867 align. */
1868 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1869 do_peeling = false;
1872 /* If there is an unsupportable data ref, prefer this over all choices so far
1873 since we'd have to discard a chosen peeling except when it accidentally
1874 aligned the unsupportable data ref. */
1875 if (one_dr_unsupportable)
1876 dr0 = unsupportable_dr;
1877 else if (do_peeling)
1879 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1880 TODO: Use nopeel_outside_cost or get rid of it? */
1881 unsigned nopeel_inside_cost = 0;
1882 unsigned nopeel_outside_cost = 0;
1884 stmt_vector_for_cost dummy;
1885 dummy.create (2);
1886 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1887 &nopeel_outside_cost, &dummy, 0, false);
1888 dummy.release ();
1890 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1891 costs will be recorded. */
1892 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1893 prologue_cost_vec.create (2);
1894 epilogue_cost_vec.create (2);
1896 int dummy2;
1897 nopeel_outside_cost += vect_get_known_peeling_cost
1898 (loop_vinfo, 0, &dummy2,
1899 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1900 &prologue_cost_vec, &epilogue_cost_vec);
1902 prologue_cost_vec.release ();
1903 epilogue_cost_vec.release ();
1905 npeel = best_peel.peel_info.npeel;
1906 dr0 = best_peel.peel_info.dr;
1908 /* If not peeling (leaving everything as-is) is no more expensive than
1909 the best peeling we have so far, don't perform any peeling.  */
1910 if (nopeel_inside_cost <= best_peel.inside_cost)
1911 do_peeling = false;
1914 if (do_peeling)
1916 stmt = DR_STMT (dr0);
1917 stmt_info = vinfo_for_stmt (stmt);
1918 vectype = STMT_VINFO_VECTYPE (stmt_info);
1919 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1921 if (known_alignment_for_access_p (dr0))
1923 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1924 size_zero_node) < 0;
1925 if (!npeel)
1927 /* Since it's known at compile time, compute the number of
1928 iterations in the peeled loop (the peeling factor) for use in
1929 updating DR_MISALIGNMENT values.  The peeling factor is the
1930 number of vector elements minus the misalignment as an element
1931 count.  */
1932 mis = DR_MISALIGNMENT (dr0);
1933 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1934 npeel = ((negative ? mis - nelements : nelements - mis)
1935 & (nelements - 1));
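/* Worked example with hypothetical values: if DR_MISALIGNMENT (dr0)
   is 8 bytes and the element size is 4 bytes, MIS is 2 elements;
   with NELEMENTS == 4 and a non-negative step, NPEEL becomes
   (4 - 2) & 3 == 2 peeled scalar iterations.  */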
1938 /* For interleaved data access every iteration accesses all the
1939 members of the group, therefore we divide the number of iterations
1940 by the group size. */
1941 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1942 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1943 npeel /= GROUP_SIZE (stmt_info);
1945 if (dump_enabled_p ())
1946 dump_printf_loc (MSG_NOTE, vect_location,
1947 "Try peeling by %d\n", npeel);
1950 /* Ensure that all datarefs can be vectorized after the peel. */
1951 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1952 do_peeling = false;
1954 /* Check if all datarefs are supportable and log. */
1955 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1957 stat = vect_verify_datarefs_alignment (loop_vinfo);
1958 if (!stat)
1959 do_peeling = false;
1960 else
1961 return stat;
1964 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1965 if (do_peeling)
1967 unsigned max_allowed_peel
1968 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1969 if (max_allowed_peel != (unsigned)-1)
1971 unsigned max_peel = npeel;
1972 if (max_peel == 0)
1974 gimple *dr_stmt = DR_STMT (dr0);
1975 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1976 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1977 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1979 if (max_peel > max_allowed_peel)
1981 do_peeling = false;
1982 if (dump_enabled_p ())
1983 dump_printf_loc (MSG_NOTE, vect_location,
1984 "Disable peeling, max peels reached: %d\n", max_peel);
1989 /* Cost model #2 - if peeling may result in a remaining loop not
1990 iterating enough to be vectorized then do not peel. */
1991 if (do_peeling
1992 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1994 unsigned max_peel
1995 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1996 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1997 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1998 do_peeling = false;
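/* For instance (hypothetical values): with a vectorization factor of 4
   and NPEEL unknown at compile time, MAX_PEEL is 3; a loop known to
   run 6 iterations fails the test (6 < 4 + 3), so peeling is rejected
   because the remaining loop might not reach a full vector iteration.  */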
2001 if (do_peeling)
2003 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2004 If the misalignment of DR_i is identical to that of dr0 then set
2005 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2006 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2007 by the peeling factor times the element size of DR_i (MOD the
2008 vectorization factor times the size). Otherwise, the
2009 misalignment of DR_i must be set to unknown. */
2010 FOR_EACH_VEC_ELT (datarefs, i, dr)
2011 if (dr != dr0)
2013 /* Strided accesses perform only component accesses, alignment
2014 is irrelevant for them. */
2015 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2016 if (STMT_VINFO_STRIDED_P (stmt_info)
2017 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2018 continue;
2020 vect_update_misalignment_for_peel (dr, dr0, npeel);
2023 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2024 if (npeel)
2025 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2026 else
2027 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2028 = DR_MISALIGNMENT (dr0);
2029 SET_DR_MISALIGNMENT (dr0, 0);
2030 if (dump_enabled_p ())
2032 dump_printf_loc (MSG_NOTE, vect_location,
2033 "Alignment of access forced using peeling.\n");
2034 dump_printf_loc (MSG_NOTE, vect_location,
2035 "Peeling for alignment will be applied.\n");
2038 /* The inside-loop cost will be accounted for in vectorizable_load
2039 and vectorizable_store correctly with adjusted alignments.
2040 Drop the body_cst_vec on the floor here. */
2041 stat = vect_verify_datarefs_alignment (loop_vinfo);
2042 gcc_assert (stat);
2043 return stat;
2047 /* (2) Versioning to force alignment. */
2049 /* Try versioning if:
2050 1) the loop is optimized for speed, and
2051 2) there is at least one unsupported misaligned data ref with an unknown
2052 misalignment, and
2053 3) all misaligned data refs with a known misalignment are supported, and
2054 4) the number of runtime alignment checks is within reason. */
2056 do_versioning =
2057 optimize_loop_nest_for_speed_p (loop)
2058 && (!loop->inner); /* FORNOW */
2060 if (do_versioning)
2062 FOR_EACH_VEC_ELT (datarefs, i, dr)
2064 stmt = DR_STMT (dr);
2065 stmt_info = vinfo_for_stmt (stmt);
2067 /* For interleaving, only the alignment of the first access
2068 matters. */
2069 if (aligned_access_p (dr)
2070 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2071 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2072 continue;
2074 if (STMT_VINFO_STRIDED_P (stmt_info))
2076 /* Strided loads perform only component accesses, alignment is
2077 irrelevant for them. */
2078 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2079 continue;
2080 do_versioning = false;
2081 break;
2084 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2086 if (!supportable_dr_alignment)
2088 gimple *stmt;
2089 int mask;
2090 tree vectype;
2092 if (known_alignment_for_access_p (dr)
2093 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2094 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2096 do_versioning = false;
2097 break;
2100 stmt = DR_STMT (dr);
2101 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2102 gcc_assert (vectype);
2104 /* The rightmost bits of an aligned address must be zeros.
2105 Construct the mask needed for this test. For example,
2106 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2107 mask must be 15 = 0xf. */
2108 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
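/* E.g. for a 16-byte vector mode the mask is 15 (0xf).  The versioned
   loop is guarded by a runtime test, emitted later when the loop is
   versioned, that roughly ORs the addresses of all potentially
   misaligned references and checks that the result ANDed with this
   mask is zero.  */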
2110 /* FORNOW: use the same mask to test all potentially unaligned
2111 references in the loop. The vectorizer currently supports
2112 a single vector size, see the reference to
2113 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2114 vectorization factor is computed. */
2115 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2116 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2117 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2118 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2119 DR_STMT (dr));
2123 /* Versioning requires at least one misaligned data reference. */
2124 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2125 do_versioning = false;
2126 else if (!do_versioning)
2127 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2130 if (do_versioning)
2132 vec<gimple *> may_misalign_stmts
2133 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2134 gimple *stmt;
2136 /* It can now be assumed that the data references in the statements
2137 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2138 of the loop being vectorized. */
2139 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2141 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2142 dr = STMT_VINFO_DATA_REF (stmt_info);
2143 SET_DR_MISALIGNMENT (dr, 0);
2144 if (dump_enabled_p ())
2145 dump_printf_loc (MSG_NOTE, vect_location,
2146 "Alignment of access forced using versioning.\n");
2149 if (dump_enabled_p ())
2150 dump_printf_loc (MSG_NOTE, vect_location,
2151 "Versioning for alignment will be applied.\n");
2153 /* Peeling and versioning can't be done together at this time. */
2154 gcc_assert (! (do_peeling && do_versioning));
2156 stat = vect_verify_datarefs_alignment (loop_vinfo);
2157 gcc_assert (stat);
2158 return stat;
2161 /* This point is reached if neither peeling nor versioning is being done. */
2162 gcc_assert (! (do_peeling || do_versioning));
2164 stat = vect_verify_datarefs_alignment (loop_vinfo);
2165 return stat;
2169 /* Function vect_find_same_alignment_drs.
2171 Update group and alignment relations according to the chosen
2172 vectorization factor. */
2174 static void
2175 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2177 struct data_reference *dra = DDR_A (ddr);
2178 struct data_reference *drb = DDR_B (ddr);
2179 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2180 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2182 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2183 return;
2185 if (dra == drb)
2186 return;
2188 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2189 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2190 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2191 return;
2193 /* Two references with distance zero have the same alignment. */
2194 offset_int diff = (wi::to_offset (DR_INIT (dra))
2195 - wi::to_offset (DR_INIT (drb)));
2196 if (diff != 0)
2198 /* Get the wider of the two alignments. */
2199 unsigned int align_a = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_a));
2200 unsigned int align_b = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_b));
2201 unsigned int max_align = MAX (align_a, align_b);
2203 /* Require the gap to be a multiple of the larger vector alignment. */
2204 if (!wi::multiple_of_p (diff, max_align, SIGNED))
2205 return;
2208 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2209 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2210 if (dump_enabled_p ())
2212 dump_printf_loc (MSG_NOTE, vect_location,
2213 "accesses have the same alignment: ");
2214 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2215 dump_printf (MSG_NOTE, " and ");
2216 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2217 dump_printf (MSG_NOTE, "\n");
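/* Illustrative example with hypothetical accesses: with V4SI vectors
   (TYPE_ALIGN_UNIT of the vectype == 16 bytes), int accesses a[i] and
   a[i+4] differ by 16 bytes, a multiple of 16, so the same-alignment
   relation is recorded; a[i] and a[i+2] differ by 8 bytes and are not
   recorded.  */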
2222 /* Function vect_analyze_data_refs_alignment
2224 Analyze the alignment of the data-references in the loop.
2225 Return FALSE if a data reference is found that cannot be vectorized. */
2227 bool
2228 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2230 if (dump_enabled_p ())
2231 dump_printf_loc (MSG_NOTE, vect_location,
2232 "=== vect_analyze_data_refs_alignment ===\n");
2234 /* Mark groups of data references with same alignment using
2235 data dependence information. */
2236 vec<ddr_p> ddrs = vinfo->ddrs;
2237 struct data_dependence_relation *ddr;
2238 unsigned int i;
2240 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2241 vect_find_same_alignment_drs (ddr);
2243 vec<data_reference_p> datarefs = vinfo->datarefs;
2244 struct data_reference *dr;
2246 vect_record_base_alignments (vinfo);
2247 FOR_EACH_VEC_ELT (datarefs, i, dr)
2249 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2250 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2251 && !vect_compute_data_ref_alignment (dr))
2253 /* Strided accesses perform only component accesses, misalignment
2254 information is irrelevant for them. */
2255 if (STMT_VINFO_STRIDED_P (stmt_info)
2256 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2257 continue;
2259 if (dump_enabled_p ())
2260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2261 "not vectorized: can't calculate alignment "
2262 "for data ref.\n");
2264 return false;
2268 return true;
2272 /* Analyze alignment of DRs of stmts in NODE. */
2274 static bool
2275 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2277 /* We vectorize from the first scalar stmt in the node unless
2278 the node is permuted in which case we start from the first
2279 element in the group. */
2280 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2281 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2282 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2283 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2285 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2286 if (! vect_compute_data_ref_alignment (dr)
2287 /* For creating the data-ref pointer we need alignment of the
2288 first element anyway. */
2289 || (dr != first_dr
2290 && ! vect_compute_data_ref_alignment (first_dr))
2291 || ! verify_data_ref_alignment (dr))
2293 if (dump_enabled_p ())
2294 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2295 "not vectorized: bad data alignment in basic "
2296 "block.\n");
2297 return false;
2300 return true;
2303 /* Function vect_slp_analyze_instance_alignment
2305 Analyze the alignment of the data-references in the SLP instance.
2306 Return FALSE if a data reference is found that cannot be vectorized. */
2308 bool
2309 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2311 if (dump_enabled_p ())
2312 dump_printf_loc (MSG_NOTE, vect_location,
2313 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2315 slp_tree node;
2316 unsigned i;
2317 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2318 if (! vect_slp_analyze_and_verify_node_alignment (node))
2319 return false;
2321 node = SLP_INSTANCE_TREE (instance);
2322 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2323 && ! vect_slp_analyze_and_verify_node_alignment
2324 (SLP_INSTANCE_TREE (instance)))
2325 return false;
2327 return true;
2331 /* Analyze groups of accesses: check that DR belongs to a group of
2332 accesses of legal size, step, etc. Detect gaps, single element
2333 interleaving, and other special cases. Set grouped access info.
2334 Collect groups of strided stores for further use in SLP analysis.
2335 Worker for vect_analyze_group_access. */
2337 static bool
2338 vect_analyze_group_access_1 (struct data_reference *dr)
2340 tree step = DR_STEP (dr);
2341 tree scalar_type = TREE_TYPE (DR_REF (dr));
2342 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2343 gimple *stmt = DR_STMT (dr);
2344 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2345 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2346 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2347 HOST_WIDE_INT dr_step = -1;
2348 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2349 bool slp_impossible = false;
2351 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2352 size of the interleaving group (including gaps). */
2353 if (tree_fits_shwi_p (step))
2355 dr_step = tree_to_shwi (step);
2356 /* Check that STEP is a multiple of type size. Otherwise there is
2357 a non-element-sized gap at the end of the group which we
2358 cannot represent in GROUP_GAP or GROUP_SIZE.
2359 ??? As we can handle non-constant step fine here we should
2360 simply remove uses of GROUP_GAP between the last and first
2361 element and instead rely on DR_STEP. GROUP_SIZE then would
2362 simply not include that gap. */
2363 if ((dr_step % type_size) != 0)
2365 if (dump_enabled_p ())
2367 dump_printf_loc (MSG_NOTE, vect_location,
2368 "Step ");
2369 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2370 dump_printf (MSG_NOTE,
2371 " is not a multiple of the element size for ");
2372 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2373 dump_printf (MSG_NOTE, "\n");
2375 return false;
2377 groupsize = absu_hwi (dr_step) / type_size;
2379 else
2380 groupsize = 0;
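/* E.g. for 4-byte elements and a constant step of 32 bytes,
   GROUPSIZE is 8: each iteration covers 8 element slots,
   including any gaps.  */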
2382 /* A non-consecutive access is possible only if it is part of an interleaving group.  */
2383 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2385 /* Check whether this DR is part of an interleaving group and is the
2386 only element of the group that is accessed in the loop.  */
2388 /* Gaps are supported only for loads. STEP must be a multiple of the type
2389 size. The size of the group must be a power of 2. */
2390 if (DR_IS_READ (dr)
2391 && (dr_step % type_size) == 0
2392 && groupsize > 0
2393 && pow2p_hwi (groupsize))
2395 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2396 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2397 GROUP_GAP (stmt_info) = groupsize - 1;
2398 if (dump_enabled_p ())
2400 dump_printf_loc (MSG_NOTE, vect_location,
2401 "Detected single element interleaving ");
2402 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2403 dump_printf (MSG_NOTE, " step ");
2404 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2405 dump_printf (MSG_NOTE, "\n");
2408 return true;
2411 if (dump_enabled_p ())
2413 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2414 "not consecutive access ");
2415 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2418 if (bb_vinfo)
2420 /* Mark the statement as unvectorizable. */
2421 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2422 return true;
2425 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2426 STMT_VINFO_STRIDED_P (stmt_info) = true;
2427 return true;
2430 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2432 /* First stmt in the interleaving chain. Check the chain. */
2433 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2434 struct data_reference *data_ref = dr;
2435 unsigned int count = 1;
2436 tree prev_init = DR_INIT (data_ref);
2437 gimple *prev = stmt;
2438 HOST_WIDE_INT diff, gaps = 0;
2440 while (next)
2442 /* Skip same data-refs.  In case two or more stmts share a
2443 data-ref (supported only for loads), we vectorize only the first
2444 stmt, and the rest get their vectorized loads from the first
2445 one. */
2446 if (!tree_int_cst_compare (DR_INIT (data_ref),
2447 DR_INIT (STMT_VINFO_DATA_REF (
2448 vinfo_for_stmt (next)))))
2450 if (DR_IS_WRITE (data_ref))
2452 if (dump_enabled_p ())
2453 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2454 "Two store stmts share the same dr.\n");
2455 return false;
2458 if (dump_enabled_p ())
2459 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2460 "Two or more load stmts share the same dr.\n");
2463 /* For loads, use the same data-ref load.  */
2463 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2465 prev = next;
2466 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2467 continue;
2470 prev = next;
2471 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2473 /* All group members have the same STEP by construction. */
2474 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2476 /* Check that the distance between two accesses is equal to the type
2477 size. Otherwise, we have gaps. */
2478 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2479 - TREE_INT_CST_LOW (prev_init)) / type_size;
2480 if (diff != 1)
2482 /* FORNOW: SLP of accesses with gaps is not supported. */
2483 slp_impossible = true;
2484 if (DR_IS_WRITE (data_ref))
2486 if (dump_enabled_p ())
2487 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2488 "interleaved store with gaps\n");
2489 return false;
2492 gaps += diff - 1;
2495 last_accessed_element += diff;
2497 /* Store the gap from the previous member of the group. If there is no
2498 gap in the access, GROUP_GAP is always 1. */
2499 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2501 prev_init = DR_INIT (data_ref);
2502 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2503 /* Count the number of data-refs in the chain. */
2504 count++;
2507 if (groupsize == 0)
2508 groupsize = count + gaps;
2510 /* This could be UINT_MAX but as we are generating code in a very
2511 inefficient way we have to cap earlier. See PR78699 for example. */
2512 if (groupsize > 4096)
2514 if (dump_enabled_p ())
2515 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2516 "group is too large\n");
2517 return false;
2520 /* Check that the size of the interleaving is equal to count for stores,
2521 i.e., that there are no gaps. */
2522 if (groupsize != count
2523 && !DR_IS_READ (dr))
2525 if (dump_enabled_p ())
2526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2527 "interleaved store with gaps\n");
2528 return false;
2531 /* If there is a gap after the last load in the group it is the
2532 difference between the groupsize and the last accessed
2533 element.
2534 When there is no gap, this difference should be 0. */
2535 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2537 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
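/* Worked example with hypothetical loads a[4*i], a[4*i+1] and a[4*i+3]
   (4-byte elements, step 16 bytes): COUNT is 3, the member-to-member
   diffs are 1 and 2, so GAPS is 1 and LAST_ACCESSED_ELEMENT is 4;
   GROUPSIZE is 16 / 4 == 4, the third member gets GROUP_GAP == 2 and
   the first statement gets GROUP_GAP == 4 - 4 == 0.  */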
2538 if (dump_enabled_p ())
2540 dump_printf_loc (MSG_NOTE, vect_location,
2541 "Detected interleaving ");
2542 if (DR_IS_READ (dr))
2543 dump_printf (MSG_NOTE, "load ");
2544 else
2545 dump_printf (MSG_NOTE, "store ");
2546 dump_printf (MSG_NOTE, "of size %u starting with ",
2547 (unsigned)groupsize);
2548 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2549 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2550 dump_printf_loc (MSG_NOTE, vect_location,
2551 "There is a gap of %u elements after the group\n",
2552 GROUP_GAP (vinfo_for_stmt (stmt)));
2555 /* SLP: create an SLP data structure for every interleaving group of
2556 stores for further analysis in vect_analyze_slp.  */
2557 if (DR_IS_WRITE (dr) && !slp_impossible)
2559 if (loop_vinfo)
2560 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2561 if (bb_vinfo)
2562 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2566 return true;
2569 /* Analyze groups of accesses: check that DR belongs to a group of
2570 accesses of legal size, step, etc. Detect gaps, single element
2571 interleaving, and other special cases. Set grouped access info.
2572 Collect groups of strided stores for further use in SLP analysis. */
2574 static bool
2575 vect_analyze_group_access (struct data_reference *dr)
2577 if (!vect_analyze_group_access_1 (dr))
2579 /* Dissolve the group if present. */
2580 gimple *next;
2581 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2582 while (stmt)
2584 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2585 next = GROUP_NEXT_ELEMENT (vinfo);
2586 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2587 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2588 stmt = next;
2590 return false;
2592 return true;
2595 /* Analyze the access pattern of the data-reference DR.
2596 In case of non-consecutive accesses call vect_analyze_group_access() to
2597 analyze groups of accesses. */
2599 static bool
2600 vect_analyze_data_ref_access (struct data_reference *dr)
2602 tree step = DR_STEP (dr);
2603 tree scalar_type = TREE_TYPE (DR_REF (dr));
2604 gimple *stmt = DR_STMT (dr);
2605 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2606 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2607 struct loop *loop = NULL;
2609 if (loop_vinfo)
2610 loop = LOOP_VINFO_LOOP (loop_vinfo);
2612 if (loop_vinfo && !step)
2614 if (dump_enabled_p ())
2615 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2616 "bad data-ref access in loop\n");
2617 return false;
2620 /* Allow loads with zero step in inner-loop vectorization. */
2621 if (loop_vinfo && integer_zerop (step))
2623 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2624 if (!nested_in_vect_loop_p (loop, stmt))
2625 return DR_IS_READ (dr);
2626 /* Allow references with zero step for outer loops marked
2627 with pragma omp simd only - it guarantees absence of
2628 loop-carried dependencies between inner loop iterations. */
2629 if (!loop->force_vectorize)
2631 if (dump_enabled_p ())
2632 dump_printf_loc (MSG_NOTE, vect_location,
2633 "zero step in inner loop of nest\n");
2634 return false;
2638 if (loop && nested_in_vect_loop_p (loop, stmt))
2640 /* Interleaved accesses are not yet supported within outer-loop
2641 vectorization for references in the inner-loop. */
2642 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2644 /* For the rest of the analysis we use the outer-loop step. */
2645 step = STMT_VINFO_DR_STEP (stmt_info);
2646 if (integer_zerop (step))
2648 if (dump_enabled_p ())
2649 dump_printf_loc (MSG_NOTE, vect_location,
2650 "zero step in outer loop.\n");
2651 return DR_IS_READ (dr);
2655 /* Consecutive? */
2656 if (TREE_CODE (step) == INTEGER_CST)
2658 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2659 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2660 || (dr_step < 0
2661 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2663 /* Mark that it is not interleaving. */
2664 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2665 return true;
2669 if (loop && nested_in_vect_loop_p (loop, stmt))
2671 if (dump_enabled_p ())
2672 dump_printf_loc (MSG_NOTE, vect_location,
2673 "grouped access in outer loop.\n");
2674 return false;
2678 /* Assume this is a DR handled by non-constant strided load case. */
2679 if (TREE_CODE (step) != INTEGER_CST)
2680 return (STMT_VINFO_STRIDED_P (stmt_info)
2681 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2682 || vect_analyze_group_access (dr)));
2684 /* Not a consecutive access - check whether it is part of an interleaving group.  */
2685 return vect_analyze_group_access (dr);
2688 /* Comparison function used to sort data-references DRA and DRB
2689 into chunks suitable for grouping.  */
2691 static int
2692 dr_group_sort_cmp (const void *dra_, const void *drb_)
2694 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2695 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2696 int cmp;
2698 /* Stabilize sort. */
2699 if (dra == drb)
2700 return 0;
2702 /* DRs in different loops never belong to the same group. */
2703 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2704 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2705 if (loopa != loopb)
2706 return loopa->num < loopb->num ? -1 : 1;
2708 /* Ordering of DRs according to base. */
2709 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2711 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2712 DR_BASE_ADDRESS (drb));
2713 if (cmp != 0)
2714 return cmp;
2717 /* And according to DR_OFFSET. */
2718 if (!dr_equal_offsets_p (dra, drb))
2720 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2721 if (cmp != 0)
2722 return cmp;
2725 /* Put reads before writes. */
2726 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2727 return DR_IS_READ (dra) ? -1 : 1;
2729 /* Then sort by access size.  */
2730 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2731 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2733 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2734 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2735 if (cmp != 0)
2736 return cmp;
2739 /* And by step.  */
2740 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2742 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2743 if (cmp != 0)
2744 return cmp;
2747 /* Then sort by DR_INIT.  In case of identical DRs, sort by stmt UID.  */
2748 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2749 if (cmp == 0)
2750 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2751 return cmp;
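/* In short, the sort keys are: containing loop, DR_BASE_ADDRESS,
   DR_OFFSET, reads before writes, access size, DR_STEP, DR_INIT,
   and finally the stmt UID as a tie-breaker, so that data-refs that
   may form a group end up adjacent in the sorted array.  */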
2754 /* Function vect_analyze_data_ref_accesses.
2756 Analyze the access pattern of all the data references in the loop.
2758 FORNOW: the only access pattern that is considered vectorizable is a
2759 simple step 1 (consecutive) access.
2761 FORNOW: handle only arrays and pointer accesses. */
2763 bool
2764 vect_analyze_data_ref_accesses (vec_info *vinfo)
2766 unsigned int i;
2767 vec<data_reference_p> datarefs = vinfo->datarefs;
2768 struct data_reference *dr;
2770 if (dump_enabled_p ())
2771 dump_printf_loc (MSG_NOTE, vect_location,
2772 "=== vect_analyze_data_ref_accesses ===\n");
2774 if (datarefs.is_empty ())
2775 return true;
2777 /* Sort the array of datarefs to make building the interleaving chains
2778 linear.  Don't modify the original vector's order; it is needed for
2779 determining what dependencies are reversed. */
2780 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2781 datarefs_copy.qsort (dr_group_sort_cmp);
2783 /* Build the interleaving chains. */
2784 for (i = 0; i < datarefs_copy.length () - 1;)
2786 data_reference_p dra = datarefs_copy[i];
2787 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2788 stmt_vec_info lastinfo = NULL;
2789 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2791 ++i;
2792 continue;
2794 for (i = i + 1; i < datarefs_copy.length (); ++i)
2796 data_reference_p drb = datarefs_copy[i];
2797 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2798 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2799 break;
2801 /* ???  Imperfect sorting (non-compatible types, non-modulo
2802 accesses, same accesses) can lead to a group being artificially
2803 split here, as we don't just skip over those.  If it really
2804 matters we can push those to a worklist and re-iterate
2805 over them.  Then we can just skip ahead to the next DR here.  */
2807 /* DRs in a different loop should not be put into the same
2808 interleaving group. */
2809 if (gimple_bb (DR_STMT (dra))->loop_father
2810 != gimple_bb (DR_STMT (drb))->loop_father)
2811 break;
2813 /* Check that the data-refs have the same first location (except init)
2814 and they are both either store or load (not load and store,
2815 not masked loads or stores). */
2816 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2817 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2818 DR_BASE_ADDRESS (drb), 0)
2819 || !dr_equal_offsets_p (dra, drb)
2820 || !gimple_assign_single_p (DR_STMT (dra))
2821 || !gimple_assign_single_p (DR_STMT (drb)))
2822 break;
2824 /* Check that the data-refs have the same constant size. */
2825 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2826 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2827 if (!tree_fits_uhwi_p (sza)
2828 || !tree_fits_uhwi_p (szb)
2829 || !tree_int_cst_equal (sza, szb))
2830 break;
2832 /* Check that the data-refs have the same step. */
2833 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2834 break;
2836 /* Do not place the same access in the interleaving chain twice. */
2837 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2838 break;
2840 /* Check the types are compatible.
2841 ??? We don't distinguish this during sorting. */
2842 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2843 TREE_TYPE (DR_REF (drb))))
2844 break;
2846 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2847 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2848 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2849 gcc_assert (init_a <= init_b);
2851 /* If init_b == init_a + the size of the type * k, we have an
2852 interleaving, and DRA is accessed before DRB. */
2853 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2854 if (type_size_a == 0
2855 || (init_b - init_a) % type_size_a != 0)
2856 break;
2858 /* If we have a store, the accesses are adjacent. This splits
2859 groups into chunks we support (we don't support vectorization
2860 of stores with gaps). */
2861 if (!DR_IS_READ (dra)
2862 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2863 (DR_INIT (datarefs_copy[i-1]))
2864 != type_size_a))
2865 break;
2867 /* If the step (when constant and nonzero) is not greater than the
2868 difference between the data-refs' inits, the refs cannot be members
2869 of the same group; this check splits groups into suitable sizes.  */
2870 if (tree_fits_shwi_p (DR_STEP (dra)))
2872 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2873 if (step != 0 && step <= (init_b - init_a))
2874 break;
2877 if (dump_enabled_p ())
2879 dump_printf_loc (MSG_NOTE, vect_location,
2880 "Detected interleaving ");
2881 if (DR_IS_READ (dra))
2882 dump_printf (MSG_NOTE, "load ");
2883 else
2884 dump_printf (MSG_NOTE, "store ");
2885 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2886 dump_printf (MSG_NOTE, " and ");
2887 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2888 dump_printf (MSG_NOTE, "\n");
2891 /* Link the found element into the group list. */
2892 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2894 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2895 lastinfo = stmtinfo_a;
2897 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2898 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2899 lastinfo = stmtinfo_b;
2903 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2904 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2905 && !vect_analyze_data_ref_access (dr))
2907 if (dump_enabled_p ())
2908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2909 "not vectorized: complicated access pattern.\n");
2911 if (is_a <bb_vec_info> (vinfo))
2913 /* Mark the statement as not vectorizable. */
2914 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2915 continue;
2917 else
2919 datarefs_copy.release ();
2920 return false;
2924 datarefs_copy.release ();
2925 return true;
2928 /* Function vect_vfa_segment_size.
2930 Create an expression that computes the size of the segment
2931 that will be accessed by a data reference.  The function takes into
2932 account that realignment loads may access one more vector.
2934 Input:
2935 DR: The data reference.
2936 LENGTH_FACTOR: segment length to consider.
2938 Return an expression whose value is the size of segment which will be
2939 accessed by DR. */
2941 static tree
2942 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2944 tree segment_length;
2946 if (integer_zerop (DR_STEP (dr)))
2947 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2948 else
2949 segment_length = size_binop (MULT_EXPR,
2950 fold_convert (sizetype, DR_STEP (dr)),
2951 fold_convert (sizetype, length_factor));
2953 if (vect_supportable_dr_alignment (dr, false)
2954 == dr_explicit_realign_optimized)
2956 tree vector_size = TYPE_SIZE_UNIT
2957 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2959 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2961 return segment_length;
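/* For example (hypothetical values): with DR_STEP == 4 bytes and
   LENGTH_FACTOR == 8 (the vectorization factor), the segment size is
   32 bytes; if the access uses dr_explicit_realign_optimized with a
   16-byte vector type, 16 more bytes are added, giving 48.  */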
2964 /* Function vect_no_alias_p.
2966 Given data references A and B with equal base and offset, the alias
2967 relation can be decided at compilation time.  Return TRUE if they do
2968 not alias each other; return FALSE otherwise.  SEGMENT_LENGTH_A
2969 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2970 respectively. */
2972 static bool
2973 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2974 tree segment_length_a, tree segment_length_b)
2976 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
2977 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
2978 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
2979 return false;
2981 tree seg_a_min = DR_INIT (a);
2982 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
2983 seg_a_min, segment_length_a);
2984 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
2985 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
2986 [a, a+12) */
2987 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
2989 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
2990 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
2991 seg_a_max, unit_size);
2992 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
2993 DR_INIT (a), unit_size);
2995 tree seg_b_min = DR_INIT (b);
2996 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
2997 seg_b_min, segment_length_b);
2998 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3000 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
3001 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
3002 seg_b_max, unit_size);
3003 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
3004 DR_INIT (b), unit_size);
3007 if (tree_int_cst_le (seg_a_max, seg_b_min)
3008 || tree_int_cst_le (seg_b_max, seg_a_min))
3009 return true;
3011 return false;
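/* Worked example with hypothetical values: if A starts at init 0 with
   a 32-byte segment and B starts at init 32 with a 32-byte segment
   (both with positive step), then SEG_A_MAX == 32 <= SEG_B_MIN == 32
   and the function returns true (no alias).  Equal DR_INITs always
   return false above.  */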
3014 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3015 in DDR is >= VF. */
3017 static bool
3018 dependence_distance_ge_vf (data_dependence_relation *ddr,
3019 unsigned int loop_depth, unsigned HOST_WIDE_INT vf)
3021 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3022 || DDR_NUM_DIST_VECTS (ddr) == 0)
3023 return false;
3025 /* If the dependence is exact, we should have limited the VF instead. */
3026 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3028 unsigned int i;
3029 lambda_vector dist_v;
3030 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3032 HOST_WIDE_INT dist = dist_v[loop_depth];
3033 if (dist != 0
3034 && !(dist > 0 && DDR_REVERSED_P (ddr))
3035 && (unsigned HOST_WIDE_INT) abs_hwi (dist) < vf)
3036 return false;
3039 if (dump_enabled_p ())
3041 dump_printf_loc (MSG_NOTE, vect_location,
3042 "dependence distance between ");
3043 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3044 dump_printf (MSG_NOTE, " and ");
3045 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3046 dump_printf (MSG_NOTE, " is >= VF\n");
3049 return true;
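/* E.g. with VF == 4: distance-vector entries of 0 or with absolute
   value >= 4 at LOOP_DEPTH pass the check; a (non-reversed) distance
   of 2 makes the function return false, so the runtime alias check
   is kept.  */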
3052 /* Function vect_prune_runtime_alias_test_list.
3054 Prune a list of ddrs to be tested at run-time by versioning for alias.
3055 Merge several alias checks into one if possible.
3056 Return FALSE if the resulting list of ddrs is longer than allowed by
3057 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS; otherwise return TRUE.  */
3059 bool
3060 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3062 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3063 hash_set <tree_pair_hash> compared_objects;
3065 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3066 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3067 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3068 vec<vec_object_pair> &check_unequal_addrs
3069 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3070 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3071 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3073 ddr_p ddr;
3074 unsigned int i;
3075 tree length_factor;
3077 if (dump_enabled_p ())
3078 dump_printf_loc (MSG_NOTE, vect_location,
3079 "=== vect_prune_runtime_alias_test_list ===\n");
3081 if (may_alias_ddrs.is_empty ())
3082 return true;
3084 comp_alias_ddrs.create (may_alias_ddrs.length ());
3086 unsigned int loop_depth
3087 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3088 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3090 /* First, we collect all data ref pairs for aliasing checks. */
3091 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3093 int comp_res;
3094 struct data_reference *dr_a, *dr_b;
3095 gimple *dr_group_first_a, *dr_group_first_b;
3096 tree segment_length_a, segment_length_b;
3097 gimple *stmt_a, *stmt_b;
3099 /* Ignore the alias if the VF we chose ended up being no greater
3100 than the dependence distance. */
3101 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3102 continue;
3104 if (DDR_OBJECT_A (ddr))
3106 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3107 if (!compared_objects.add (new_pair))
3109 if (dump_enabled_p ())
3111 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3112 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3113 dump_printf (MSG_NOTE, " and ");
3114 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3115 dump_printf (MSG_NOTE, " have different addresses\n");
3117 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3119 continue;
3122 dr_a = DDR_A (ddr);
3123 stmt_a = DR_STMT (DDR_A (ddr));
3124 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3125 if (dr_group_first_a)
3127 stmt_a = dr_group_first_a;
3128 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3131 dr_b = DDR_B (ddr);
3132 stmt_b = DR_STMT (DDR_B (ddr));
3133 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3134 if (dr_group_first_b)
3136 stmt_b = dr_group_first_b;
3137 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3140 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3141 length_factor = scalar_loop_iters;
3142 else
3143 length_factor = size_int (vect_factor);
3144 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3145 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3147 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3148 DR_BASE_ADDRESS (dr_b));
3149 if (comp_res == 0)
3150 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3151 DR_OFFSET (dr_b));
3153 /* Alias is known at compilation time. */
3154 if (comp_res == 0
3155 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3156 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3157 && TREE_CODE (segment_length_a) == INTEGER_CST
3158 && TREE_CODE (segment_length_b) == INTEGER_CST)
3160 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3161 continue;
3163 if (dump_enabled_p ())
3164 dump_printf_loc (MSG_NOTE, vect_location,
3165 "not vectorized: compilation time alias.\n");
3167 return false;
3170 dr_with_seg_len_pair_t dr_with_seg_len_pair
3171 (dr_with_seg_len (dr_a, segment_length_a),
3172 dr_with_seg_len (dr_b, segment_length_b));
3174 /* Canonicalize pairs by sorting the two DR members. */
3175 if (comp_res > 0)
3176 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3178 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3181 prune_runtime_alias_test_list (&comp_alias_ddrs,
3182 (unsigned HOST_WIDE_INT) vect_factor);
3184 unsigned int count = (comp_alias_ddrs.length ()
3185 + check_unequal_addrs.length ());
3186 dump_printf_loc (MSG_NOTE, vect_location,
3187 "improved number of alias checks from %d to %d\n",
3188 may_alias_ddrs.length (), count);
3189 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3191 if (dump_enabled_p ())
3192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3193 "number of versioning for alias "
3194 "run-time tests exceeds %d "
3195 "(--param vect-max-version-for-alias-checks)\n",
3196 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3197 return false;
3200 return true;
3203 /* Return true if a non-affine read or write in STMT is suitable for a
3204 gather load or scatter store. Describe the operation in *INFO if so. */
3206 bool
3207 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3208 gather_scatter_info *info)
3210 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3211 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3212 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3213 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3214 tree offtype = NULL_TREE;
3215 tree decl, base, off;
3216 machine_mode pmode;
3217 int punsignedp, reversep, pvolatilep = 0;
3219 base = DR_REF (dr);
3220 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3221 see if we can use the def stmt of the address. */
3222 if (is_gimple_call (stmt)
3223 && gimple_call_internal_p (stmt)
3224 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3225 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3226 && TREE_CODE (base) == MEM_REF
3227 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3228 && integer_zerop (TREE_OPERAND (base, 1))
3229 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3231 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3232 if (is_gimple_assign (def_stmt)
3233 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3234 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3237 /* The gather and scatter builtins need address of the form
3238 loop_invariant + vector * {1, 2, 4, 8}
3239 or
3240 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3241 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3242 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3243 multiplications and additions in it. To get a vector, we need
3244 a single SSA_NAME that will be defined in the loop and will
3245 contain everything that is not loop invariant and that can be
3246 vectorized.  The following code attempts to find such a preexisting
3247 SSA_NAME OFF and put the loop invariants into a tree BASE
3248 that can be gimplified before the loop. */
3249 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3250 &punsignedp, &reversep, &pvolatilep);
3251 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3253 if (TREE_CODE (base) == MEM_REF)
3255 if (!integer_zerop (TREE_OPERAND (base, 1)))
3257 if (off == NULL_TREE)
3259 offset_int moff = mem_ref_offset (base);
3260 off = wide_int_to_tree (sizetype, moff);
3262 else
3263 off = size_binop (PLUS_EXPR, off,
3264 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3266 base = TREE_OPERAND (base, 0);
3268 else
3269 base = build_fold_addr_expr (base);
3271 if (off == NULL_TREE)
3272 off = size_zero_node;
3274 /* If base is not loop invariant, then if off is 0 we start with just
3275 the constant offset in the loop invariant BASE and continue with base
3276 as OFF; otherwise give up.
3277 We could handle that case by gimplifying the addition of base + off
3278 into some SSA_NAME and using that as off, but for now punt.  */
3279 if (!expr_invariant_in_loop_p (loop, base))
3281 if (!integer_zerop (off))
3282 return false;
3283 off = base;
3284 base = size_int (pbitpos / BITS_PER_UNIT);
3286 /* Otherwise put base + constant offset into the loop invariant BASE
3287 and continue with OFF. */
3288 else
3290 base = fold_convert (sizetype, base);
3291 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3294 /* OFF at this point may be either a SSA_NAME or some tree expression
3295 from get_inner_reference. Try to peel off loop invariants from it
3296 into BASE as long as possible. */
3297 STRIP_NOPS (off);
3298 while (offtype == NULL_TREE)
3300 enum tree_code code;
3301 tree op0, op1, add = NULL_TREE;
3303 if (TREE_CODE (off) == SSA_NAME)
3305 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3307 if (expr_invariant_in_loop_p (loop, off))
3308 return false;
3310 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3311 break;
3313 op0 = gimple_assign_rhs1 (def_stmt);
3314 code = gimple_assign_rhs_code (def_stmt);
3315 op1 = gimple_assign_rhs2 (def_stmt);
3317 else
3319 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3320 return false;
3321 code = TREE_CODE (off);
3322 extract_ops_from_tree (off, &code, &op0, &op1);
3324 switch (code)
3326 case POINTER_PLUS_EXPR:
3327 case PLUS_EXPR:
3328 if (expr_invariant_in_loop_p (loop, op0))
3330 add = op0;
3331 off = op1;
3332 do_add:
3333 add = fold_convert (sizetype, add);
3334 if (scale != 1)
3335 add = size_binop (MULT_EXPR, add, size_int (scale));
3336 base = size_binop (PLUS_EXPR, base, add);
3337 continue;
3339 if (expr_invariant_in_loop_p (loop, op1))
3341 add = op1;
3342 off = op0;
3343 goto do_add;
3345 break;
3346 case MINUS_EXPR:
3347 if (expr_invariant_in_loop_p (loop, op1))
3349 add = fold_convert (sizetype, op1);
3350 add = size_binop (MINUS_EXPR, size_zero_node, add);
3351 off = op0;
3352 goto do_add;
3354 break;
3355 case MULT_EXPR:
3356 if (scale == 1 && tree_fits_shwi_p (op1))
3358 scale = tree_to_shwi (op1);
3359 off = op0;
3360 continue;
3362 break;
3363 case SSA_NAME:
3364 off = op0;
3365 continue;
3366 CASE_CONVERT:
3367 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3368 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3369 break;
3370 if (TYPE_PRECISION (TREE_TYPE (op0))
3371 == TYPE_PRECISION (TREE_TYPE (off)))
3373 off = op0;
3374 continue;
3376 if (TYPE_PRECISION (TREE_TYPE (op0))
3377 < TYPE_PRECISION (TREE_TYPE (off)))
3379 off = op0;
3380 offtype = TREE_TYPE (off);
3381 STRIP_NOPS (off);
3382 continue;
3384 break;
3385 default:
3386 break;
3388 break;
3391 /* If at the end OFF still isn't a SSA_NAME or isn't
3392 defined in the loop, punt. */
3393 if (TREE_CODE (off) != SSA_NAME
3394 || expr_invariant_in_loop_p (loop, off))
3395 return false;
3397 if (offtype == NULL_TREE)
3398 offtype = TREE_TYPE (off);
3400 if (DR_IS_READ (dr))
3401 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3402 offtype, scale);
3403 else
3404 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3405 offtype, scale);
3407 if (decl == NULL_TREE)
3408 return false;
3410 info->decl = decl;
3411 info->base = base;
3412 info->offset = off;
3413 info->offset_dt = vect_unknown_def_type;
3414 info->offset_vectype = NULL_TREE;
3415 info->scale = scale;
3416 return true;
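/* Illustrative example (hypothetical loop): for a read such as
   ... = a[b[i]] with int indices, BASE typically ends up as &a plus any
   constant byte offset, OFF is the SSA_NAME holding the (possibly
   sign-extended) index, SCALE is the element size of a, and the target
   hook must return a suitable gather builtin for that vectype, offset
   type and scale, otherwise we return false.  */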
3419 /* Function vect_analyze_data_refs.
3421 Find all the data references in the loop or basic block.
3423 The general structure of the analysis of data refs in the vectorizer is as
3424 follows:
3425 1- vect_analyze_data_refs(loop/bb): call
3426 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3427 in the loop/bb and their dependences.
3428 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3429 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3430 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3434 bool
3435 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3437 struct loop *loop = NULL;
3438 unsigned int i;
3439 struct data_reference *dr;
3440 tree scalar_type;
3442 if (dump_enabled_p ())
3443 dump_printf_loc (MSG_NOTE, vect_location,
3444 "=== vect_analyze_data_refs ===\n");
3446 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3447 loop = LOOP_VINFO_LOOP (loop_vinfo);
3449 /* Go through the data-refs, check that the analysis succeeded. Update
3450 pointer from stmt_vec_info struct to DR and vectype. */
3452 vec<data_reference_p> datarefs = vinfo->datarefs;
3453 FOR_EACH_VEC_ELT (datarefs, i, dr)
3455 gimple *stmt;
3456 stmt_vec_info stmt_info;
3457 tree base, offset, init;
3458 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3459 bool simd_lane_access = false;
3460 int vf;
3462 again:
3463 if (!dr || !DR_REF (dr))
3465 if (dump_enabled_p ())
3466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3467 "not vectorized: unhandled data-ref\n");
3468 return false;
3471 stmt = DR_STMT (dr);
3472 stmt_info = vinfo_for_stmt (stmt);
3474 /* Discard clobbers from the dataref vector. We will remove
3475 clobber stmts during vectorization. */
3476 if (gimple_clobber_p (stmt))
3478 free_data_ref (dr);
3479 if (i == datarefs.length () - 1)
3481 datarefs.pop ();
3482 break;
3484 datarefs.ordered_remove (i);
3485 dr = datarefs[i];
3486 goto again;
3489 /* Check that analysis of the data-ref succeeded. */
3490 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3491 || !DR_STEP (dr))
3493 bool maybe_gather
3494 = DR_IS_READ (dr)
3495 && !TREE_THIS_VOLATILE (DR_REF (dr))
3496 && targetm.vectorize.builtin_gather != NULL;
3497 bool maybe_scatter
3498 = DR_IS_WRITE (dr)
3499 && !TREE_THIS_VOLATILE (DR_REF (dr))
3500 && targetm.vectorize.builtin_scatter != NULL;
3501 bool maybe_simd_lane_access
3502 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3504 /* If the target supports vector gather loads or scatter stores, or if
3505 this might be a SIMD lane access, see if we can use them here.  */
3506 if (is_a <loop_vec_info> (vinfo)
3507 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3508 && !nested_in_vect_loop_p (loop, stmt))
3510 struct data_reference *newdr
3511 = create_data_ref (NULL, loop_containing_stmt (stmt),
3512 DR_REF (dr), stmt, !maybe_scatter,
3513 DR_IS_CONDITIONAL_IN_STMT (dr));
3514 gcc_assert (newdr != NULL && DR_REF (newdr));
3515 if (DR_BASE_ADDRESS (newdr)
3516 && DR_OFFSET (newdr)
3517 && DR_INIT (newdr)
3518 && DR_STEP (newdr)
3519 && integer_zerop (DR_STEP (newdr)))
3521 if (maybe_simd_lane_access)
3523 tree off = DR_OFFSET (newdr);
3524 STRIP_NOPS (off);
3525 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3526 && TREE_CODE (off) == MULT_EXPR
3527 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3529 tree step = TREE_OPERAND (off, 1);
3530 off = TREE_OPERAND (off, 0);
3531 STRIP_NOPS (off);
3532 if (CONVERT_EXPR_P (off)
3533 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3534 0)))
3535 < TYPE_PRECISION (TREE_TYPE (off)))
3536 off = TREE_OPERAND (off, 0);
3537 if (TREE_CODE (off) == SSA_NAME)
3539 gimple *def = SSA_NAME_DEF_STMT (off);
3540 tree reft = TREE_TYPE (DR_REF (newdr));
3541 if (is_gimple_call (def)
3542 && gimple_call_internal_p (def)
3543 && (gimple_call_internal_fn (def)
3544 == IFN_GOMP_SIMD_LANE))
3546 tree arg = gimple_call_arg (def, 0);
3547 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3548 arg = SSA_NAME_VAR (arg);
3549 if (arg == loop->simduid
3550 /* For now. */
3551 && tree_int_cst_equal
3552 (TYPE_SIZE_UNIT (reft),
3553 step))
3555 DR_OFFSET (newdr) = ssize_int (0);
3556 DR_STEP (newdr) = step;
3557 DR_OFFSET_ALIGNMENT (newdr)
3558 = BIGGEST_ALIGNMENT;
3559 DR_STEP_ALIGNMENT (newdr)
3560 = highest_pow2_factor (step);
3561 dr = newdr;
3562 simd_lane_access = true;
3568 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3570 dr = newdr;
3571 if (maybe_gather)
3572 gatherscatter = GATHER;
3573 else
3574 gatherscatter = SCATTER;
3577 if (gatherscatter == SG_NONE && !simd_lane_access)
3578 free_data_ref (newdr);
3581 if (gatherscatter == SG_NONE && !simd_lane_access)
3583 if (dump_enabled_p ())
3585 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3586 "not vectorized: data ref analysis "
3587 "failed ");
3588 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3591 if (is_a <bb_vec_info> (vinfo))
3592 break;
3594 return false;
3598 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3600 if (dump_enabled_p ())
3601 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3602 "not vectorized: base addr of dr is a "
3603 "constant\n");
3605 if (is_a <bb_vec_info> (vinfo))
3606 break;
3608 if (gatherscatter != SG_NONE || simd_lane_access)
3609 free_data_ref (dr);
3610 return false;
3613 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3615 if (dump_enabled_p ())
3617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3618 "not vectorized: volatile type ");
3619 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3622 if (is_a <bb_vec_info> (vinfo))
3623 break;
3625 return false;
3628 if (stmt_can_throw_internal (stmt))
3630 if (dump_enabled_p ())
3632 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3633 "not vectorized: statement can throw an "
3634 "exception ");
3635 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3638 if (is_a <bb_vec_info> (vinfo))
3639 break;
3641 if (gatherscatter != SG_NONE || simd_lane_access)
3642 free_data_ref (dr);
3643 return false;
3646 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3647 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3649 if (dump_enabled_p ())
3651 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3652 "not vectorized: statement is bitfield "
3653 "access ");
3654 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3657 if (is_a <bb_vec_info> (vinfo))
3658 break;
3660 if (gatherscatter != SG_NONE || simd_lane_access)
3661 free_data_ref (dr);
3662 return false;
3665 base = unshare_expr (DR_BASE_ADDRESS (dr));
3666 offset = unshare_expr (DR_OFFSET (dr));
3667 init = unshare_expr (DR_INIT (dr));
3669 if (is_gimple_call (stmt)
3670 && (!gimple_call_internal_p (stmt)
3671 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3672 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3674 if (dump_enabled_p ())
3676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3677 "not vectorized: dr in a call ");
3678 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3681 if (is_a <bb_vec_info> (vinfo))
3682 break;
3684 if (gatherscatter != SG_NONE || simd_lane_access)
3685 free_data_ref (dr);
3686 return false;
3689 /* Update DR field in stmt_vec_info struct. */
3691 /* If the dataref is in an inner-loop of the loop that is considered for
3692 vectorization, we also want to analyze the access relative to
3693 the outer-loop (DR contains information only relative to the
3694 inner-most enclosing loop). We do that by building a reference to the
3695 first location accessed by the inner-loop, and analyze it relative to
3696 the outer-loop. */
3697 if (loop && nested_in_vect_loop_p (loop, stmt))
3699 /* Build a reference to the first location accessed by the
3700 inner loop: *(BASE + INIT + OFFSET). By construction,
3701 this address must be invariant in the inner loop, so we
3702 can consider it as being used in the outer loop. */
3703 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3704 init, offset);
3705 tree init_addr = fold_build_pointer_plus (base, init_offset);
3706 tree init_ref = build_fold_indirect_ref (init_addr);
3708 if (dump_enabled_p ())
3710 dump_printf_loc (MSG_NOTE, vect_location,
3711 "analyze in outer loop: ");
3712 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3713 dump_printf (MSG_NOTE, "\n");
3716 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3717 init_ref, loop))
3718 /* dr_analyze_innermost already explained the failure. */
3719 return false;
3721 if (dump_enabled_p ())
3723 dump_printf_loc (MSG_NOTE, vect_location,
3724 "\touter base_address: ");
3725 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3726 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3727 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3728 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3729 STMT_VINFO_DR_OFFSET (stmt_info));
3730 dump_printf (MSG_NOTE,
3731 "\n\touter constant offset from base address: ");
3732 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3733 STMT_VINFO_DR_INIT (stmt_info));
3734 dump_printf (MSG_NOTE, "\n\touter step: ");
3735 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3736 STMT_VINFO_DR_STEP (stmt_info));
3737 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3738 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3739 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3740 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3741 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3742 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3743 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3744 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3748 if (STMT_VINFO_DATA_REF (stmt_info))
3750 if (dump_enabled_p ())
3752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3753 "not vectorized: more than one data ref "
3754 "in stmt: ");
3755 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3758 if (is_a <bb_vec_info> (vinfo))
3759 break;
3761 if (gatherscatter != SG_NONE || simd_lane_access)
3762 free_data_ref (dr);
3763 return false;
3766 STMT_VINFO_DATA_REF (stmt_info) = dr;
3767 if (simd_lane_access)
3769 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3770 free_data_ref (datarefs[i]);
3771 datarefs[i] = dr;
3774 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3775 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3776 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3778 if (dump_enabled_p ())
3780 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3781 "not vectorized: base object not addressable "
3782 "for stmt: ");
3783 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3785 if (is_a <bb_vec_info> (vinfo))
3787 /* In BB vectorization the ref can still participate
3788 in dependence analysis, we just can't vectorize it. */
3789 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3790 continue;
3792 return false;
3795 /* Set vectype for STMT. */
3796 scalar_type = TREE_TYPE (DR_REF (dr));
3797 STMT_VINFO_VECTYPE (stmt_info)
3798 = get_vectype_for_scalar_type (scalar_type);
3799 if (!STMT_VINFO_VECTYPE (stmt_info))
3801 if (dump_enabled_p ())
3803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3804 "not vectorized: no vectype for stmt: ");
3805 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3806 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3807 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3808 scalar_type);
3809 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3812 if (is_a <bb_vec_info> (vinfo))
3814 /* No vector type is fine, the ref can still participate
3815 in dependence analysis, we just can't vectorize it. */
3816 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3817 continue;
3820 if (gatherscatter != SG_NONE || simd_lane_access)
3822 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3823 if (gatherscatter != SG_NONE)
3824 free_data_ref (dr);
3826 return false;
3828 else
3830 if (dump_enabled_p ())
3832 dump_printf_loc (MSG_NOTE, vect_location,
3833 "got vectype for stmt: ");
3834 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3835 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3836 STMT_VINFO_VECTYPE (stmt_info));
3837 dump_printf (MSG_NOTE, "\n");
3841 /* Adjust the minimal vectorization factor according to the
3842 vector type. */
3843 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3844 if (vf > *min_vf)
3845 *min_vf = vf;
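	  /* For example (assuming 128-bit vectors, purely for illustration):
	     a loop touching both a char array (V16QI, 16 subparts) and an
	     int array (V4SI, 4 subparts) ends up with *min_vf == 16, the
	     larger of the two subpart counts.  */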
3847 if (gatherscatter != SG_NONE)
3849 gather_scatter_info gs_info;
3850 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3851 &gs_info)
3852 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3854 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3855 free_data_ref (dr);
3856 if (dump_enabled_p ())
3858 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3859 (gatherscatter == GATHER) ?
3860 "not vectorized: not suitable for gather "
3861 "load " :
3862 "not vectorized: not suitable for scatter "
3863 "store ");
3864 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3866 return false;
3869 free_data_ref (datarefs[i]);
3870 datarefs[i] = dr;
3871 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3874 else if (is_a <loop_vec_info> (vinfo)
3875 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3877 if (nested_in_vect_loop_p (loop, stmt))
3879 if (dump_enabled_p ())
3881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3882 "not vectorized: not suitable for strided "
3883 "load ");
3884 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3886 return false;
3888 STMT_VINFO_STRIDED_P (stmt_info) = true;
3892 /* If we stopped analysis at the first dataref we could not analyze
3893 when trying to vectorize a basic-block, mark the rest of the datarefs
3894 as not vectorizable and truncate the vector of datarefs. That
3895 avoids spending useless time in analyzing their dependence. */
3896 if (i != datarefs.length ())
3898 gcc_assert (is_a <bb_vec_info> (vinfo));
3899 for (unsigned j = i; j < datarefs.length (); ++j)
3901 data_reference_p dr = datarefs[j];
3902 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3903 free_data_ref (dr);
3905 datarefs.truncate (i);
3908 return true;
3912 /* Function vect_get_new_vect_var.
3914 Returns a new temporary variable. The current naming scheme prepends
3915 the prefix "vect", "stmp", "mask" or "vectp" (depending on the value
3916 of VAR_KIND), followed by "_" and NAME when NAME is provided; otherwise
3917 the prefix alone is used. */
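/* For example (illustrative only; dumps may append a numeric suffix),
   vect_get_new_vect_var (ptr_type, vect_pointer_var, "a") creates a
   temporary named "vectp_a", while a NULL NAME gives just "vectp".  */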
3919 tree
3920 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3922 const char *prefix;
3923 tree new_vect_var;
3925 switch (var_kind)
3927 case vect_simple_var:
3928 prefix = "vect";
3929 break;
3930 case vect_scalar_var:
3931 prefix = "stmp";
3932 break;
3933 case vect_mask_var:
3934 prefix = "mask";
3935 break;
3936 case vect_pointer_var:
3937 prefix = "vectp";
3938 break;
3939 default:
3940 gcc_unreachable ();
3943 if (name)
3945 char* tmp = concat (prefix, "_", name, NULL);
3946 new_vect_var = create_tmp_reg (type, tmp);
3947 free (tmp);
3949 else
3950 new_vect_var = create_tmp_reg (type, prefix);
3952 return new_vect_var;
3955 /* Like vect_get_new_vect_var but return an SSA name. */
3957 tree
3958 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3960 const char *prefix;
3961 tree new_vect_var;
3963 switch (var_kind)
3965 case vect_simple_var:
3966 prefix = "vect";
3967 break;
3968 case vect_scalar_var:
3969 prefix = "stmp";
3970 break;
3971 case vect_pointer_var:
3972 prefix = "vectp";
3973 break;
3974 default:
3975 gcc_unreachable ();
3978 if (name)
3980 char* tmp = concat (prefix, "_", name, NULL);
3981 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3982 free (tmp);
3984 else
3985 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3987 return new_vect_var;
3990 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
3992 static void
3993 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3994 stmt_vec_info stmt_info)
3996 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3997 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3998 int misalign = DR_MISALIGNMENT (dr);
3999 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4000 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4001 else
4002 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
4005 /* Function vect_create_addr_base_for_vector_ref.
4007 Create an expression that computes the address of the first memory location
4008 that will be accessed for a data reference.
4010 Input:
4011 STMT: The statement containing the data reference.
4012 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4013 OFFSET: Optional. If supplied, it is added to the initial address.
4014 LOOP: Specify relative to which loop-nest the address should be computed.
4015 For example, when the dataref is in an inner-loop nested in an
4016 outer-loop that is now being vectorized, LOOP can be either the
4017 outer-loop, or the inner-loop. The first memory location accessed
4018 by the following dataref ('in' points to short):
4020 for (i=0; i<N; i++)
4021 for (j=0; j<M; j++)
4022 s += in[i+j]
4024 is as follows:
4025 if LOOP=i_loop: &in (relative to i_loop)
4026 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4027 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4028 initial address. Unlike OFFSET, which is the number of elements to
4029 be added, BYTE_OFFSET is measured in bytes.
4031 Output:
4032 1. Return an SSA_NAME whose value is the address of the memory location of
4033 the first vector of the data reference.
4034 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4035 these statement(s) which define the returned SSA_NAME.
4037 FORNOW: We are only handling array accesses with step 1. */
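/* For illustration (a sketch with made-up SSA names), for a dataref a[i]
   vectorized in a loop, with no OFFSET and no BYTE_OFFSET, the statements
   added to NEW_STMT_LIST amount to:

     base_off_1 = (sizetype) DR_OFFSET (dr) + (sizetype) DR_INIT (dr);
     vectp_a_2 = DR_BASE_ADDRESS (dr) p+ base_off_1;

   and vectp_a_2 is returned.  A supplied OFFSET is first scaled by the
   element size; a supplied BYTE_OFFSET is added unscaled.  */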
4039 tree
4040 vect_create_addr_base_for_vector_ref (gimple *stmt,
4041 gimple_seq *new_stmt_list,
4042 tree offset,
4043 tree byte_offset)
4045 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4046 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4047 const char *base_name;
4048 tree addr_base;
4049 tree dest;
4050 gimple_seq seq = NULL;
4051 tree vect_ptr_type;
4052 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4053 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4054 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4056 tree data_ref_base = unshare_expr (drb->base_address);
4057 tree base_offset = unshare_expr (drb->offset);
4058 tree init = unshare_expr (drb->init);
4060 if (loop_vinfo)
4061 base_name = get_name (data_ref_base);
4062 else
4064 base_offset = ssize_int (0);
4065 init = ssize_int (0);
4066 base_name = get_name (DR_REF (dr));
4069 /* Create base_offset */
4070 base_offset = size_binop (PLUS_EXPR,
4071 fold_convert (sizetype, base_offset),
4072 fold_convert (sizetype, init));
4074 if (offset)
4076 offset = fold_build2 (MULT_EXPR, sizetype,
4077 fold_convert (sizetype, offset), step);
4078 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4079 base_offset, offset);
4081 if (byte_offset)
4083 byte_offset = fold_convert (sizetype, byte_offset);
4084 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4085 base_offset, byte_offset);
4088 /* base + base_offset */
4089 if (loop_vinfo)
4090 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4091 else
4093 addr_base = build1 (ADDR_EXPR,
4094 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4095 unshare_expr (DR_REF (dr)));
4098 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4099 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4100 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4101 gimple_seq_add_seq (new_stmt_list, seq);
4103 if (DR_PTR_INFO (dr)
4104 && TREE_CODE (addr_base) == SSA_NAME
4105 && !SSA_NAME_PTR_INFO (addr_base))
4107 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4108 if (offset || byte_offset)
4109 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4112 if (dump_enabled_p ())
4114 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4115 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4116 dump_printf (MSG_NOTE, "\n");
4119 return addr_base;
4123 /* Function vect_create_data_ref_ptr.
4125 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4126 location accessed in the loop by STMT, along with the def-use update
4127 chain to appropriately advance the pointer through the loop iterations.
4128 Also set aliasing information for the pointer. This pointer is used by
4129 the callers to this function to create a memory reference expression for
4130 vector load/store access.
4132 Input:
4133 1. STMT: a stmt that references memory. Expected to be of the form
4134 GIMPLE_ASSIGN <name, data-ref> or
4135 GIMPLE_ASSIGN <data-ref, name>.
4136 2. AGGR_TYPE: the type of the reference, which should be either a vector
4137 or an array.
4138 3. AT_LOOP: the loop where the vector memref is to be created.
4139 4. OFFSET (optional): an offset to be added to the initial address accessed
4140 by the data-ref in STMT.
4141 5. BSI: location where the new stmts are to be placed if there is no loop
4142 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4143 pointing to the initial address.
4144 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4145 to the initial address accessed by the data-ref in STMT. This is
4146 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4147 in bytes.
4149 Output:
4150 1. Declare a new ptr to vector_type, and have it point to the base of the
4151 data reference (initial address accessed by the data reference).
4152 For example, for vector of type V8HI, the following code is generated:
4154 v8hi *ap;
4155 ap = (v8hi *)initial_address;
4157 if OFFSET is not supplied:
4158 initial_address = &a[init];
4159 if OFFSET is supplied:
4160 initial_address = &a[init + OFFSET];
4161 if BYTE_OFFSET is supplied:
4162 initial_address = &a[init] + BYTE_OFFSET;
4164 Return the initial_address in INITIAL_ADDRESS.
4166 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4167 update the pointer in each iteration of the loop.
4169 Return the increment stmt that updates the pointer in PTR_INCR.
4171 3. Set INV_P to true if the access pattern of the data reference in the
4172 vectorized loop is invariant. Set it to false otherwise.
4174 4. Return the pointer. */
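/* A typical call from the load/store vectorization code looks like this
   (a sketch; the variable names are illustrative only):

     tree dummy, dataref_ptr;
     gimple *ptr_incr;
     bool inv_p;
     dataref_ptr = vect_create_data_ref_ptr (stmt, vectype, loop, NULL_TREE,
					     &dummy, gsi, &ptr_incr,
					     false, &inv_p, NULL_TREE);

   The caller then dereferences DATAREF_PTR for the first vector access and
   relies on PTR_INCR to advance it by TYPE_SIZE_UNIT (vectype) in each
   vectorized iteration.  */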
4176 tree
4177 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4178 tree offset, tree *initial_address,
4179 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4180 bool only_init, bool *inv_p, tree byte_offset)
4182 const char *base_name;
4183 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4184 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4185 struct loop *loop = NULL;
4186 bool nested_in_vect_loop = false;
4187 struct loop *containing_loop = NULL;
4188 tree aggr_ptr_type;
4189 tree aggr_ptr;
4190 tree new_temp;
4191 gimple_seq new_stmt_list = NULL;
4192 edge pe = NULL;
4193 basic_block new_bb;
4194 tree aggr_ptr_init;
4195 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4196 tree aptr;
4197 gimple_stmt_iterator incr_gsi;
4198 bool insert_after;
4199 tree indx_before_incr, indx_after_incr;
4200 gimple *incr;
4201 tree step;
4202 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4204 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4205 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4207 if (loop_vinfo)
4209 loop = LOOP_VINFO_LOOP (loop_vinfo);
4210 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4211 containing_loop = (gimple_bb (stmt))->loop_father;
4212 pe = loop_preheader_edge (loop);
4214 else
4216 gcc_assert (bb_vinfo);
4217 only_init = true;
4218 *ptr_incr = NULL;
4221 /* Check the step (evolution) of the load in LOOP, and record
4222 whether it's invariant. */
4223 step = vect_dr_behavior (dr)->step;
4224 if (integer_zerop (step))
4225 *inv_p = true;
4226 else
4227 *inv_p = false;
4229 /* Create an expression for the first address accessed by this load
4230 in LOOP. */
4231 base_name = get_name (DR_BASE_ADDRESS (dr));
4233 if (dump_enabled_p ())
4235 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4236 dump_printf_loc (MSG_NOTE, vect_location,
4237 "create %s-pointer variable to type: ",
4238 get_tree_code_name (TREE_CODE (aggr_type)));
4239 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4240 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4241 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4242 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4243 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4244 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4245 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4246 else
4247 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4248 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4249 dump_printf (MSG_NOTE, "\n");
4252 /* (1) Create the new aggregate-pointer variable.
4253 Vector and array types inherit the alias set of their component
4254 type by default so we need to use a ref-all pointer if the data
4255 reference does not conflict with the created aggregated data
4256 reference because it is not addressable. */
4257 bool need_ref_all = false;
4258 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4259 get_alias_set (DR_REF (dr))))
4260 need_ref_all = true;
4261 /* Likewise for any of the data references in the stmt group. */
4262 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4264 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4267 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4268 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4269 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4270 get_alias_set (DR_REF (sdr))))
4272 need_ref_all = true;
4273 break;
4275 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4277 while (orig_stmt);
4279 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4280 need_ref_all);
4281 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4284 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4285 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4286 def-use update cycles for the pointer: one relative to the outer-loop
4287 (LOOP), which is what steps (3) and (4) below do. The other is relative
4288 to the inner-loop (which is the inner-most loop containing the dataref),
4289 and this is done by step (5) below.
4291 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4292 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4293 redundant. Steps (3),(4) create the following:
4295 vp0 = &base_addr;
4296 LOOP: vp1 = phi(vp0,vp2)
4299 vp2 = vp1 + step
4300 goto LOOP
4302 If there is an inner-loop nested in loop, then step (5) will also be
4303 applied, and an additional update in the inner-loop will be created:
4305 vp0 = &base_addr;
4306 LOOP: vp1 = phi(vp0,vp2)
4308 inner: vp3 = phi(vp1,vp4)
4309 vp4 = vp3 + inner_step
4310 if () goto inner
4312 vp2 = vp1 + step
4313 if () goto LOOP */
4315 /* (2) Calculate the initial address of the aggregate-pointer, and set
4316 the aggregate-pointer to point to it before the loop. */
4318 /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader. */
4320 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4321 offset, byte_offset);
4322 if (new_stmt_list)
4324 if (pe)
4326 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4327 gcc_assert (!new_bb);
4329 else
4330 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4333 *initial_address = new_temp;
4334 aggr_ptr_init = new_temp;
4336 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4337 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4338 inner-loop nested in LOOP (during outer-loop vectorization). */
4340 /* No update in loop is required. */
4341 if (only_init && (!loop_vinfo || at_loop == loop))
4342 aptr = aggr_ptr_init;
4343 else
4345 /* The step of the aggregate pointer is the type size. */
4346 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4347 /* One exception to the above is when the scalar step of the load in
4348 LOOP is zero. In this case the step here is also zero. */
4349 if (*inv_p)
4350 iv_step = size_zero_node;
4351 else if (tree_int_cst_sgn (step) == -1)
4352 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4354 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4356 create_iv (aggr_ptr_init,
4357 fold_convert (aggr_ptr_type, iv_step),
4358 aggr_ptr, loop, &incr_gsi, insert_after,
4359 &indx_before_incr, &indx_after_incr);
4360 incr = gsi_stmt (incr_gsi);
4361 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4363 /* Copy the points-to information if it exists. */
4364 if (DR_PTR_INFO (dr))
4366 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4367 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4369 if (ptr_incr)
4370 *ptr_incr = incr;
4372 aptr = indx_before_incr;
4375 if (!nested_in_vect_loop || only_init)
4376 return aptr;
4379 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4380 nested in LOOP, if it exists. */
4382 gcc_assert (nested_in_vect_loop);
4383 if (!only_init)
4385 standard_iv_increment_position (containing_loop, &incr_gsi,
4386 &insert_after);
4387 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4388 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4389 &indx_after_incr);
4390 incr = gsi_stmt (incr_gsi);
4391 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4393 /* Copy the points-to information if it exists. */
4394 if (DR_PTR_INFO (dr))
4396 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4397 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4399 if (ptr_incr)
4400 *ptr_incr = incr;
4402 return indx_before_incr;
4404 else
4405 gcc_unreachable ();
4409 /* Function bump_vector_ptr
4411 Increment a pointer (to a vector type) by vector-size. If requested,
4412 i.e. if PTR-INCR is given, then also connect the new increment stmt
4413 to the existing def-use update-chain of the pointer, by modifying
4414 the PTR_INCR as illustrated below:
4416 The pointer def-use update-chain before this function:
4417 DATAREF_PTR = phi (p_0, p_2)
4418 ....
4419 PTR_INCR: p_2 = DATAREF_PTR + step
4421 The pointer def-use update-chain after this function:
4422 DATAREF_PTR = phi (p_0, p_2)
4423 ....
4424 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4425 ....
4426 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4428 Input:
4429 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4430 in the loop.
4431 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4432 the loop. The increment amount across iterations is expected
4433 to be vector_size.
4434 BSI - location where the new update stmt is to be placed.
4435 STMT - the original scalar memory-access stmt that is being vectorized.
4436 BUMP - optional. The offset by which to bump the pointer. If not given,
4437 the offset is assumed to be vector_size.
4439 Output: Return NEW_DATAREF_PTR as illustrated above.
4443 tree
4444 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4445 gimple *stmt, tree bump)
4447 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4448 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4449 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4450 tree update = TYPE_SIZE_UNIT (vectype);
4451 gassign *incr_stmt;
4452 ssa_op_iter iter;
4453 use_operand_p use_p;
4454 tree new_dataref_ptr;
4456 if (bump)
4457 update = bump;
4459 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4460 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4461 else
4462 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4463 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4464 dataref_ptr, update);
4465 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4467 /* Copy the points-to information if it exists. */
4468 if (DR_PTR_INFO (dr))
4470 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4471 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4474 if (!ptr_incr)
4475 return new_dataref_ptr;
4477 /* Update the vector-pointer's cross-iteration increment. */
4478 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4480 tree use = USE_FROM_PTR (use_p);
4482 if (use == dataref_ptr)
4483 SET_USE (use_p, new_dataref_ptr);
4484 else
4485 gcc_assert (tree_int_cst_compare (use, update) == 0);
4488 return new_dataref_ptr;
4492 /* Function vect_create_destination_var.
4494 Create a new temporary of type VECTYPE. */
4496 tree
4497 vect_create_destination_var (tree scalar_dest, tree vectype)
4499 tree vec_dest;
4500 const char *name;
4501 char *new_name;
4502 tree type;
4503 enum vect_var_kind kind;
4505 kind = vectype
4506 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4507 ? vect_mask_var
4508 : vect_simple_var
4509 : vect_scalar_var;
4510 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4512 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4514 name = get_name (scalar_dest);
4515 if (name)
4516 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4517 else
4518 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4519 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4520 free (new_name);
4522 return vec_dest;
4525 /* Function vect_grouped_store_supported.
4527 Returns TRUE if interleave high and interleave low permutations
4528 are supported, and FALSE otherwise. */
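/* For example, group sizes 2, 3, 4 and 8 make it past the initial size
   check below, while sizes such as 5, 6 or 7 are rejected immediately.  */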
4530 bool
4531 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4533 machine_mode mode = TYPE_MODE (vectype);
4535 /* vect_permute_store_chain requires the group size to be equal to 3 or
4536 be a power of two. */
4537 if (count != 3 && exact_log2 (count) == -1)
4539 if (dump_enabled_p ())
4540 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4541 "the size of the group of accesses"
4542 " is not a power of 2 or not eqaul to 3\n");
4543 return false;
4546 /* Check that the permutation is supported. */
4547 if (VECTOR_MODE_P (mode))
4549 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4550 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4552 if (count == 3)
4554 unsigned int j0 = 0, j1 = 0, j2 = 0;
4555 unsigned int i, j;
4557 for (j = 0; j < 3; j++)
4559 int nelt0 = ((3 - j) * nelt) % 3;
4560 int nelt1 = ((3 - j) * nelt + 1) % 3;
4561 int nelt2 = ((3 - j) * nelt + 2) % 3;
4562 for (i = 0; i < nelt; i++)
4564 if (3 * i + nelt0 < nelt)
4565 sel[3 * i + nelt0] = j0++;
4566 if (3 * i + nelt1 < nelt)
4567 sel[3 * i + nelt1] = nelt + j1++;
4568 if (3 * i + nelt2 < nelt)
4569 sel[3 * i + nelt2] = 0;
4571 if (!can_vec_perm_p (mode, false, sel))
4573 if (dump_enabled_p ())
4574 dump_printf (MSG_MISSED_OPTIMIZATION,
4575 "permutaion op not supported by target.\n");
4576 return false;
4579 for (i = 0; i < nelt; i++)
4581 if (3 * i + nelt0 < nelt)
4582 sel[3 * i + nelt0] = 3 * i + nelt0;
4583 if (3 * i + nelt1 < nelt)
4584 sel[3 * i + nelt1] = 3 * i + nelt1;
4585 if (3 * i + nelt2 < nelt)
4586 sel[3 * i + nelt2] = nelt + j2++;
4588 if (!can_vec_perm_p (mode, false, sel))
4590 if (dump_enabled_p ())
4591 dump_printf (MSG_MISSED_OPTIMIZATION,
4592 "permutaion op not supported by target.\n");
4593 return false;
4596 return true;
4598 else
4600 /* If length is not equal to 3 then only power of 2 is supported. */
4601 gcc_assert (pow2p_hwi (count));
4603 for (i = 0; i < nelt / 2; i++)
4605 sel[i * 2] = i;
4606 sel[i * 2 + 1] = i + nelt;
4608 if (can_vec_perm_p (mode, false, sel))
4610 for (i = 0; i < nelt; i++)
4611 sel[i] += nelt / 2;
4612 if (can_vec_perm_p (mode, false, sel))
4613 return true;
4618 if (dump_enabled_p ())
4619 dump_printf (MSG_MISSED_OPTIMIZATION,
4620 "permutaion op not supported by target.\n");
4621 return false;
4625 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4626 type VECTYPE. */
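/* For instance (target-specific, mentioned only as an illustration), on
   AArch64 the vec_store_lanes optab covers COUNT values 2, 3 and 4,
   corresponding to the ST2/ST3/ST4 instructions.  */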
4628 bool
4629 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4631 return vect_lanes_optab_supported_p ("vec_store_lanes",
4632 vec_store_lanes_optab,
4633 vectype, count);
4637 /* Function vect_permute_store_chain.
4639 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4640 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4641 the data correctly for the stores. Return the final references for stores
4642 in RESULT_CHAIN.
4644 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4645 The input is 4 vectors each containing 8 elements. We assign a number to
4646 each element, the input sequence is:
4648 1st vec: 0 1 2 3 4 5 6 7
4649 2nd vec: 8 9 10 11 12 13 14 15
4650 3rd vec: 16 17 18 19 20 21 22 23
4651 4th vec: 24 25 26 27 28 29 30 31
4653 The output sequence should be:
4655 1st vec: 0 8 16 24 1 9 17 25
4656 2nd vec: 2 10 18 26 3 11 19 27
4657 3rd vec: 4 12 20 28 5 13 21 29
4658 4th vec: 6 14 22 30 7 15 23 31
4660 i.e., we interleave the contents of the four vectors in their order.
4662 We use interleave_high/low instructions to create such output. The input of
4663 each interleave_high/low operation is two vectors:
4664 1st vec 2nd vec
4665 0 1 2 3 4 5 6 7
4666 the even elements of the result vector are obtained left-to-right from the
4667 high/low elements of the first vector. The odd elements of the result are
4668 obtained left-to-right from the high/low elements of the second vector.
4669 The output of interleave_high will be: 0 4 1 5
4670 and of interleave_low: 2 6 3 7
4673 The permutation is done in log LENGTH stages. In each stage interleave_high
4674 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4675 where the first argument is taken from the first half of DR_CHAIN and the
4676 second argument from its second half.
4677 In our example,
4679 I1: interleave_high (1st vec, 3rd vec)
4680 I2: interleave_low (1st vec, 3rd vec)
4681 I3: interleave_high (2nd vec, 4th vec)
4682 I4: interleave_low (2nd vec, 4th vec)
4684 The output for the first stage is:
4686 I1: 0 16 1 17 2 18 3 19
4687 I2: 4 20 5 21 6 22 7 23
4688 I3: 8 24 9 25 10 26 11 27
4689 I4: 12 28 13 29 14 30 15 31
4691 The output of the second stage, i.e. the final result is:
4693 I1: 0 8 16 24 1 9 17 25
4694 I2: 2 10 18 26 3 11 19 27
4695 I3: 4 12 20 28 5 13 21 29
4696 I4: 6 14 22 30 7 15 23 31. */
4698 void
4699 vect_permute_store_chain (vec<tree> dr_chain,
4700 unsigned int length,
4701 gimple *stmt,
4702 gimple_stmt_iterator *gsi,
4703 vec<tree> *result_chain)
4705 tree vect1, vect2, high, low;
4706 gimple *perm_stmt;
4707 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4708 tree perm_mask_low, perm_mask_high;
4709 tree data_ref;
4710 tree perm3_mask_low, perm3_mask_high;
4711 unsigned int i, n, log_length = exact_log2 (length);
4712 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4713 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4715 result_chain->quick_grow (length);
4716 memcpy (result_chain->address (), dr_chain.address (),
4717 length * sizeof (tree));
4719 if (length == 3)
4721 unsigned int j0 = 0, j1 = 0, j2 = 0;
4723 for (j = 0; j < 3; j++)
4725 int nelt0 = ((3 - j) * nelt) % 3;
4726 int nelt1 = ((3 - j) * nelt + 1) % 3;
4727 int nelt2 = ((3 - j) * nelt + 2) % 3;
4729 for (i = 0; i < nelt; i++)
4731 if (3 * i + nelt0 < nelt)
4732 sel[3 * i + nelt0] = j0++;
4733 if (3 * i + nelt1 < nelt)
4734 sel[3 * i + nelt1] = nelt + j1++;
4735 if (3 * i + nelt2 < nelt)
4736 sel[3 * i + nelt2] = 0;
4738 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4740 for (i = 0; i < nelt; i++)
4742 if (3 * i + nelt0 < nelt)
4743 sel[3 * i + nelt0] = 3 * i + nelt0;
4744 if (3 * i + nelt1 < nelt)
4745 sel[3 * i + nelt1] = 3 * i + nelt1;
4746 if (3 * i + nelt2 < nelt)
4747 sel[3 * i + nelt2] = nelt + j2++;
4749 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4751 vect1 = dr_chain[0];
4752 vect2 = dr_chain[1];
4754 /* Create interleaving stmt:
4755 low = VEC_PERM_EXPR <vect1, vect2,
4756 {j, nelt, *, j + 1, nelt + j + 1, *,
4757 j + 2, nelt + j + 2, *, ...}> */
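	  /* E.g. for nelt == 4 and j == 0 this mask is {0, 4, 0, 1}
	     (the don't-care '*' slots are filled with 0).  */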
4758 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4759 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4760 vect2, perm3_mask_low);
4761 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4763 vect1 = data_ref;
4764 vect2 = dr_chain[2];
4765 /* Create interleaving stmt:
4766 low = VEC_PERM_EXPR <vect1, vect2,
4767 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4768 6, 7, nelt + j + 2, ...}> */
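	  /* E.g. for nelt == 4 and j == 0 this mask is {0, 1, 4, 3}.  */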
4769 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4770 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4771 vect2, perm3_mask_high);
4772 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4773 (*result_chain)[j] = data_ref;
4776 else
4778 /* If length is not equal to 3 then only power of 2 is supported. */
4779 gcc_assert (pow2p_hwi (length));
4781 for (i = 0, n = nelt / 2; i < n; i++)
4783 sel[i * 2] = i;
4784 sel[i * 2 + 1] = i + nelt;
4786 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4788 for (i = 0; i < nelt; i++)
4789 sel[i] += nelt / 2;
4790 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
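      /* E.g. for nelt == 8 the high mask is {0, 8, 1, 9, 2, 10, 3, 11}
	 and the low mask is {4, 12, 5, 13, 6, 14, 7, 15}.  */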
4792 for (i = 0, n = log_length; i < n; i++)
4794 for (j = 0; j < length/2; j++)
4796 vect1 = dr_chain[j];
4797 vect2 = dr_chain[j+length/2];
4799 /* Create interleaving stmt:
4800 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4801 ...}> */
4802 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4803 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4804 vect2, perm_mask_high);
4805 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4806 (*result_chain)[2*j] = high;
4808 /* Create interleaving stmt:
4809 low = VEC_PERM_EXPR <vect1, vect2,
4810 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4811 ...}> */
4812 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4813 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4814 vect2, perm_mask_low);
4815 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4816 (*result_chain)[2*j+1] = low;
4818 memcpy (dr_chain.address (), result_chain->address (),
4819 length * sizeof (tree));
4824 /* Function vect_setup_realignment
4826 This function is called when vectorizing an unaligned load using
4827 the dr_explicit_realign[_optimized] scheme.
4828 This function generates the following code at the loop prolog:
4830 p = initial_addr;
4831 x msq_init = *(floor(p)); # prolog load
4832 realignment_token = call target_builtin;
4833 loop:
4834 x msq = phi (msq_init, ---)
4836 The stmts marked with x are generated only for the case of
4837 dr_explicit_realign_optimized.
4839 The code above sets up a new (vector) pointer, pointing to the first
4840 location accessed by STMT, and a "floor-aligned" load using that pointer.
4841 It also generates code to compute the "realignment-token" (if the relevant
4842 target hook was defined), and creates a phi-node at the loop-header bb
4843 whose arguments are the result of the prolog-load (created by this
4844 function) and the result of a load that takes place in the loop (to be
4845 created by the caller to this function).
4847 For the case of dr_explicit_realign_optimized:
4848 The caller to this function uses the phi-result (msq) to create the
4849 realignment code inside the loop, and sets up the missing phi argument,
4850 as follows:
4851 loop:
4852 msq = phi (msq_init, lsq)
4853 lsq = *(floor(p')); # load in loop
4854 result = realign_load (msq, lsq, realignment_token);
4856 For the case of dr_explicit_realign:
4857 loop:
4858 msq = *(floor(p)); # load in loop
4859 p' = p + (VS-1);
4860 lsq = *(floor(p')); # load in loop
4861 result = realign_load (msq, lsq, realignment_token);
4863 Input:
4864 STMT - (scalar) load stmt to be vectorized. This load accesses
4865 a memory location that may be unaligned.
4866 BSI - place where new code is to be inserted.
4867 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4868 is used.
4870 Output:
4871 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4872 target hook, if defined.
4873 Return value - the result of the loop-header phi node. */
4875 tree
4876 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4877 tree *realignment_token,
4878 enum dr_alignment_support alignment_support_scheme,
4879 tree init_addr,
4880 struct loop **at_loop)
4882 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4883 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4884 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4885 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4886 struct loop *loop = NULL;
4887 edge pe = NULL;
4888 tree scalar_dest = gimple_assign_lhs (stmt);
4889 tree vec_dest;
4890 gimple *inc;
4891 tree ptr;
4892 tree data_ref;
4893 basic_block new_bb;
4894 tree msq_init = NULL_TREE;
4895 tree new_temp;
4896 gphi *phi_stmt;
4897 tree msq = NULL_TREE;
4898 gimple_seq stmts = NULL;
4899 bool inv_p;
4900 bool compute_in_loop = false;
4901 bool nested_in_vect_loop = false;
4902 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4903 struct loop *loop_for_initial_load = NULL;
4905 if (loop_vinfo)
4907 loop = LOOP_VINFO_LOOP (loop_vinfo);
4908 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4911 gcc_assert (alignment_support_scheme == dr_explicit_realign
4912 || alignment_support_scheme == dr_explicit_realign_optimized);
4914 /* We need to generate three things:
4915 1. the misalignment computation
4916 2. the extra vector load (for the optimized realignment scheme).
4917 3. the phi node for the two vectors from which the realignment is
4918 done (for the optimized realignment scheme). */
4920 /* 1. Determine where to generate the misalignment computation.
4922 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4923 calculation will be generated by this function, outside the loop (in the
4924 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4925 caller, inside the loop.
4927 Background: If the misalignment remains fixed throughout the iterations of
4928 the loop, then both realignment schemes are applicable, and also the
4929 misalignment computation can be done outside LOOP. This is because we are
4930 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4931 are a multiple of VS (the Vector Size), and therefore the misalignment in
4932 different vectorized LOOP iterations is always the same.
4933 The problem arises only if the memory access is in an inner-loop nested
4934 inside LOOP, which is now being vectorized using outer-loop vectorization.
4935 This is the only case when the misalignment of the memory access may not
4936 remain fixed throughout the iterations of the inner-loop (as explained in
4937 detail in vect_supportable_dr_alignment). In this case, not only is the
4938 optimized realignment scheme not applicable, but also the misalignment
4939 computation (and generation of the realignment token that is passed to
4940 REALIGN_LOAD) have to be done inside the loop.
4942 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4943 or not, which in turn determines if the misalignment is computed inside
4944 the inner-loop, or outside LOOP. */
4946 if (init_addr != NULL_TREE || !loop_vinfo)
4948 compute_in_loop = true;
4949 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4953 /* 2. Determine where to generate the extra vector load.
4955 For the optimized realignment scheme, instead of generating two vector
4956 loads in each iteration, we generate a single extra vector load in the
4957 preheader of the loop, and in each iteration reuse the result of the
4958 vector load from the previous iteration. In case the memory access is in
4959 an inner-loop nested inside LOOP, which is now being vectorized using
4960 outer-loop vectorization, we need to determine whether this initial vector
4961 load should be generated at the preheader of the inner-loop, or can be
4962 generated at the preheader of LOOP. If the memory access has no evolution
4963 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4964 to be generated inside LOOP (in the preheader of the inner-loop). */
4966 if (nested_in_vect_loop)
4968 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4969 bool invariant_in_outerloop =
4970 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4971 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4973 else
4974 loop_for_initial_load = loop;
4975 if (at_loop)
4976 *at_loop = loop_for_initial_load;
4978 if (loop_for_initial_load)
4979 pe = loop_preheader_edge (loop_for_initial_load);
4981 /* 3. For the case of the optimized realignment, create the first vector
4982 load at the loop preheader. */
4984 if (alignment_support_scheme == dr_explicit_realign_optimized)
4986 /* Create msq_init = *(floor(p1)) in the loop preheader */
4987 gassign *new_stmt;
4989 gcc_assert (!compute_in_loop);
4990 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4991 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4992 NULL_TREE, &init_addr, NULL, &inc,
4993 true, &inv_p);
4994 if (TREE_CODE (ptr) == SSA_NAME)
4995 new_temp = copy_ssa_name (ptr);
4996 else
4997 new_temp = make_ssa_name (TREE_TYPE (ptr));
4998 new_stmt = gimple_build_assign
4999 (new_temp, BIT_AND_EXPR, ptr,
5000 build_int_cst (TREE_TYPE (ptr),
5001 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5002 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5003 gcc_assert (!new_bb);
5004 data_ref
5005 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5006 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5007 new_stmt = gimple_build_assign (vec_dest, data_ref);
5008 new_temp = make_ssa_name (vec_dest, new_stmt);
5009 gimple_assign_set_lhs (new_stmt, new_temp);
5010 if (pe)
5012 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5013 gcc_assert (!new_bb);
5015 else
5016 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5018 msq_init = gimple_assign_lhs (new_stmt);
5021 /* 4. Create realignment token using a target builtin, if available.
5022 It is done either inside the containing loop, or before LOOP (as
5023 determined above). */
5025 if (targetm.vectorize.builtin_mask_for_load)
5027 gcall *new_stmt;
5028 tree builtin_decl;
5030 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5031 if (!init_addr)
5033 /* Generate the INIT_ADDR computation outside LOOP. */
5034 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5035 NULL_TREE);
5036 if (loop)
5038 pe = loop_preheader_edge (loop);
5039 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5040 gcc_assert (!new_bb);
5042 else
5043 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5046 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5047 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5048 vec_dest =
5049 vect_create_destination_var (scalar_dest,
5050 gimple_call_return_type (new_stmt));
5051 new_temp = make_ssa_name (vec_dest, new_stmt);
5052 gimple_call_set_lhs (new_stmt, new_temp);
5054 if (compute_in_loop)
5055 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5056 else
5058 /* Generate the misalignment computation outside LOOP. */
5059 pe = loop_preheader_edge (loop);
5060 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5061 gcc_assert (!new_bb);
5064 *realignment_token = gimple_call_lhs (new_stmt);
5066 /* The result of the CALL_EXPR to this builtin is determined from
5067 the value of the parameter and no global variables are touched
5068 which makes the builtin a "const" function. Requiring the
5069 builtin to have the "const" attribute makes it unnecessary
5070 to call mark_call_clobbered. */
5071 gcc_assert (TREE_READONLY (builtin_decl));
5074 if (alignment_support_scheme == dr_explicit_realign)
5075 return msq;
5077 gcc_assert (!compute_in_loop);
5078 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5081 /* 5. Create msq = phi <msq_init, lsq> in loop */
5083 pe = loop_preheader_edge (containing_loop);
5084 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5085 msq = make_ssa_name (vec_dest);
5086 phi_stmt = create_phi_node (msq, containing_loop->header);
5087 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5089 return msq;
5093 /* Function vect_grouped_load_supported.
5095 COUNT is the size of the load group (the number of statements plus the
5096 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5097 only one statement, with a gap of COUNT - 1.
5099 Returns true if a suitable permute exists. */
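/* For example, for 4-element vectors and COUNT == 3, the k == 0 pair of
   masks tried below is {0, 3, 6, 0} followed by {0, 1, 2, 5}; applied in
   sequence they select elements 0, 3, 6 and 9 of the load group.  */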
5101 bool
5102 vect_grouped_load_supported (tree vectype, bool single_element_p,
5103 unsigned HOST_WIDE_INT count)
5105 machine_mode mode = TYPE_MODE (vectype);
5107 /* If this is single-element interleaving with an element distance
5108 that leaves unused vector loads around punt - we at least create
5109 very sub-optimal code in that case (and blow up memory,
5110 see PR65518). */
5111 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5113 if (dump_enabled_p ())
5114 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5115 "single-element interleaving not supported "
5116 "for not adjacent vector loads\n");
5117 return false;
5120 /* vect_permute_load_chain requires the group size to be equal to 3 or
5121 be a power of two. */
5122 if (count != 3 && exact_log2 (count) == -1)
5124 if (dump_enabled_p ())
5125 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5126 "the size of the group of accesses"
5127 " is not a power of 2 or not equal to 3\n");
5128 return false;
5131 /* Check that the permutation is supported. */
5132 if (VECTOR_MODE_P (mode))
5134 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5135 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5137 if (count == 3)
5139 unsigned int k;
5140 for (k = 0; k < 3; k++)
5142 for (i = 0; i < nelt; i++)
5143 if (3 * i + k < 2 * nelt)
5144 sel[i] = 3 * i + k;
5145 else
5146 sel[i] = 0;
5147 if (!can_vec_perm_p (mode, false, sel))
5149 if (dump_enabled_p ())
5150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5151 "shuffle of 3 loads is not supported by"
5152 " target\n");
5153 return false;
5155 for (i = 0, j = 0; i < nelt; i++)
5156 if (3 * i + k < 2 * nelt)
5157 sel[i] = i;
5158 else
5159 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5160 if (!can_vec_perm_p (mode, false, sel))
5162 if (dump_enabled_p ())
5163 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5164 "shuffle of 3 loads is not supported by"
5165 " target\n");
5166 return false;
5169 return true;
5171 else
5173 /* If length is not equal to 3 then only power of 2 is supported. */
5174 gcc_assert (pow2p_hwi (count));
5175 for (i = 0; i < nelt; i++)
5176 sel[i] = i * 2;
5177 if (can_vec_perm_p (mode, false, sel))
5179 for (i = 0; i < nelt; i++)
5180 sel[i] = i * 2 + 1;
5181 if (can_vec_perm_p (mode, false, sel))
5182 return true;
5187 if (dump_enabled_p ())
5188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5189 "extract even/odd not supported by target\n");
5190 return false;
5193 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5194 type VECTYPE. */
5196 bool
5197 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5199 return vect_lanes_optab_supported_p ("vec_load_lanes",
5200 vec_load_lanes_optab,
5201 vectype, count);
5204 /* Function vect_permute_load_chain.
5206 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5207 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5208 the input data correctly. Return the final references for loads in
5209 RESULT_CHAIN.
5211 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5212 The input is 4 vectors each containing 8 elements. We assign a number to each
5213 element, the input sequence is:
5215 1st vec: 0 1 2 3 4 5 6 7
5216 2nd vec: 8 9 10 11 12 13 14 15
5217 3rd vec: 16 17 18 19 20 21 22 23
5218 4th vec: 24 25 26 27 28 29 30 31
5220 The output sequence should be:
5222 1st vec: 0 4 8 12 16 20 24 28
5223 2nd vec: 1 5 9 13 17 21 25 29
5224 3rd vec: 2 6 10 14 18 22 26 30
5225 4th vec: 3 7 11 15 19 23 27 31
5227 i.e., the first output vector should contain the first elements of each
5228 interleaving group, etc.
5230 We use extract_even/odd instructions to create such output. The input of
5231 each extract_even/odd operation is two vectors
5232 1st vec 2nd vec
5233 0 1 2 3 4 5 6 7
5235 and the output is the vector of extracted even/odd elements. The output of
5236 extract_even will be: 0 2 4 6
5237 and of extract_odd: 1 3 5 7
5240 The permutation is done in log LENGTH stages. In each stage extract_even
5241 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5242 their order. In our example,
5244 E1: extract_even (1st vec, 2nd vec)
5245 E2: extract_odd (1st vec, 2nd vec)
5246 E3: extract_even (3rd vec, 4th vec)
5247 E4: extract_odd (3rd vec, 4th vec)
5249 The output for the first stage will be:
5251 E1: 0 2 4 6 8 10 12 14
5252 E2: 1 3 5 7 9 11 13 15
5253 E3: 16 18 20 22 24 26 28 30
5254 E4: 17 19 21 23 25 27 29 31
5256 In order to proceed and create the correct sequence for the next stage (or
5257 for the correct output, if the second stage is the last one, as in our
5258 example), we first put the output of extract_even operation and then the
5259 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5260 The input for the second stage is:
5262 1st vec (E1): 0 2 4 6 8 10 12 14
5263 2nd vec (E3): 16 18 20 22 24 26 28 30
5264 3rd vec (E2): 1 3 5 7 9 11 13 15
5265 4th vec (E4): 17 19 21 23 25 27 29 31
5267 The output of the second stage:
5269 E1: 0 4 8 12 16 20 24 28
5270 E2: 2 6 10 14 18 22 26 30
5271 E3: 1 5 9 13 17 21 25 29
5272 E4: 3 7 11 15 19 23 27 31
5274 And RESULT_CHAIN after reordering:
5276 1st vec (E1): 0 4 8 12 16 20 24 28
5277 2nd vec (E3): 1 5 9 13 17 21 25 29
5278 3rd vec (E2): 2 6 10 14 18 22 26 30
5279 4th vec (E4): 3 7 11 15 19 23 27 31. */
5281 static void
5282 vect_permute_load_chain (vec<tree> dr_chain,
5283 unsigned int length,
5284 gimple *stmt,
5285 gimple_stmt_iterator *gsi,
5286 vec<tree> *result_chain)
5288 tree data_ref, first_vect, second_vect;
5289 tree perm_mask_even, perm_mask_odd;
5290 tree perm3_mask_low, perm3_mask_high;
5291 gimple *perm_stmt;
5292 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5293 unsigned int i, j, log_length = exact_log2 (length);
5294 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5295 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5297 result_chain->quick_grow (length);
5298 memcpy (result_chain->address (), dr_chain.address (),
5299 length * sizeof (tree));
5301 if (length == 3)
5303 unsigned int k;
5305 for (k = 0; k < 3; k++)
5307 for (i = 0; i < nelt; i++)
5308 if (3 * i + k < 2 * nelt)
5309 sel[i] = 3 * i + k;
5310 else
5311 sel[i] = 0;
5312 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5314 for (i = 0, j = 0; i < nelt; i++)
5315 if (3 * i + k < 2 * nelt)
5316 sel[i] = i;
5317 else
5318 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5320 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5322 first_vect = dr_chain[0];
5323 second_vect = dr_chain[1];
5325 /* Create interleaving stmt (low part of):
5326 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5327 ...}> */
5328 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5329 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5330 second_vect, perm3_mask_low);
5331 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5333 /* Create interleaving stmt (high part of):
5334 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5335 ...}> */
5336 first_vect = data_ref;
5337 second_vect = dr_chain[2];
5338 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5339 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5340 second_vect, perm3_mask_high);
5341 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5342 (*result_chain)[k] = data_ref;
5345 else
5347 /* If length is not equal to 3 then only power of 2 is supported. */
5348 gcc_assert (pow2p_hwi (length));
5350 for (i = 0; i < nelt; ++i)
5351 sel[i] = i * 2;
5352 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5354 for (i = 0; i < nelt; ++i)
5355 sel[i] = i * 2 + 1;
5356 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
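      /* E.g. for nelt == 8 these are {0, 2, 4, 6, 8, 10, 12, 14}
	 and {1, 3, 5, 7, 9, 11, 13, 15}.  */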
5358 for (i = 0; i < log_length; i++)
5360 for (j = 0; j < length; j += 2)
5362 first_vect = dr_chain[j];
5363 second_vect = dr_chain[j+1];
5365 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5366 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5367 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5368 first_vect, second_vect,
5369 perm_mask_even);
5370 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5371 (*result_chain)[j/2] = data_ref;
5373 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5374 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5375 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5376 first_vect, second_vect,
5377 perm_mask_odd);
5378 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5379 (*result_chain)[j/2+length/2] = data_ref;
5381 memcpy (dr_chain.address (), result_chain->address (),
5382 length * sizeof (tree));
5387 /* Function vect_shift_permute_load_chain.
5389 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
5390 sequence of stmts to reorder the input data accordingly.
5391 Return the final references for loads in RESULT_CHAIN.
5392 Return true if successful, false otherwise.
5394 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5395 The input is 3 vectors each containing 8 elements. We assign a
5396 number to each element, the input sequence is:
5398 1st vec: 0 1 2 3 4 5 6 7
5399 2nd vec: 8 9 10 11 12 13 14 15
5400 3rd vec: 16 17 18 19 20 21 22 23
5402 The output sequence should be:
5404 1st vec: 0 3 6 9 12 15 18 21
5405 2nd vec: 1 4 7 10 13 16 19 22
5406 3rd vec: 2 5 8 11 14 17 20 23
5408 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create this output.
5410 First we shuffle all 3 vectors to get the correct element order:
5412 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5413 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5414 3rd vec: (16 19 22) (17 20 23) (18 21)
5416 Next we unite and shift the vectors 3 times:
5418 1st step:
5419 shift right by 6 the concatenation of:
5420 "1st vec" and "2nd vec"
5421 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5422 "2nd vec" and "3rd vec"
5423 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5424 "3rd vec" and "1st vec"
5425 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5426 | New vectors |
5428 So that now new vectors are:
5430 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5431 2nd vec: (10 13) (16 19 22) (17 20 23)
5432 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5434 2nd step:
5435 shift right by 5 the concatenation of:
5436 "1st vec" and "3rd vec"
5437 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5438 "2nd vec" and "1st vec"
5439 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5440 "3rd vec" and "2nd vec"
5441 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5442 | New vectors |
5444 So that now new vectors are:
5446 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5447 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5448 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5450 3rd step:
5451 shift right by 5 the concatenation of:
5452 "1st vec" and "1st vec"
5453 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5454 shift right by 3 the concatenation of:
5455 "2nd vec" and "2nd vec"
5456 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5457 | New vectors |
5459 So that now all vectors are READY:
5460 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5461 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5462 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5464 This algorithm is faster than the one in vect_permute_load_chain if:
5465 1. "shift of a concatenation" is faster than a general permutation.
5466 This is usually so.
5467 2. The TARGET machine can't execute vector instructions in parallel.
5468 This is because each step of the algorithm depends on the previous one.
5469 The algorithm in vect_permute_load_chain is much more parallel.
5471 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
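/* Illustration (not part of this file): a standalone sketch, to be compiled
   separately, that replays the LENGTH == 3, NELT == 8 case described above
   on plain arrays.  vec_perm models VEC_PERM_EXPR <a, b, sel>; the mask
   values are the ones quoted in the comments of
   vect_shift_permute_load_chain, and all other names are invented for this
   sketch.  The printout matches the required output sequence above.  */

#include <stdio.h>

#define NELT 8

/* OUT[i] = element SEL[i] of the concatenation of A and B.  */
static void
vec_perm (const int *a, const int *b, const int *sel, int *out)
{
  for (int i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT];
}

int
main (void)
{
  int dr_chain[3][NELT], vect[3][NELT], vect_shift[3][NELT], result[3][NELT];
  int perm3[NELT] = { 0, 3, 6, 1, 4, 7, 2, 5 };	/* "right order" shuffle */
  int shift1[NELT], shift2[NELT], shift3[NELT], shift4[NELT];

  for (int v = 0; v < 3; v++)
    for (int i = 0; i < NELT; i++)
      dr_chain[v][i] = v * NELT + i;	/* elements 0 .. 23 */

  for (int i = 0; i < NELT; i++)
    {
      shift1[i] = 2 * (NELT / 3) + NELT % 3 + i;	/* {6 .. 13} */
      shift2[i] = 2 * (NELT / 3) + 1 + i;		/* {5 .. 12} */
      shift3[i] = NELT / 3 + (NELT % 3) / 2 + i;	/* {3 .. 10} */
      shift4[i] = 2 * (NELT / 3) + (NELT % 3) / 2 + i;	/* {5 .. 12} */
    }

  /* Shuffle each input vector into (k, k+3, k+6, ...) groups.  */
  for (int k = 0; k < 3; k++)
    vec_perm (dr_chain[k], dr_chain[k], perm3, vect[k]);

  /* 1st step: shift right by 6 the concatenation of consecutive vectors.  */
  for (int k = 0; k < 3; k++)
    vec_perm (vect[k % 3], vect[(k + 1) % 3], shift1, vect_shift[k]);

  /* 2nd step: shift right by 5 the concatenation of the shifted vectors.  */
  for (int k = 0; k < 3; k++)
    vec_perm (vect_shift[(4 - k) % 3], vect_shift[(3 - k) % 3], shift2,
	      vect[k]);

  /* 3rd step: one vector is READY; rotate the remaining two into place.  */
  for (int i = 0; i < NELT; i++)
    result[3 - NELT % 3][i] = vect[2][i];
  vec_perm (vect[0], vect[0], shift3, result[NELT % 3]);
  vec_perm (vect[1], vect[1], shift4, result[0]);

  for (int v = 0; v < 3; v++)
    {
      for (int i = 0; i < NELT; i++)
	printf ("%d ", result[v][i]);
      /* Prints 0 3 6 .. 21, then 1 4 7 .. 22, then 2 5 8 .. 23.  */
      printf ("\n");
    }
  return 0;
}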
5474 static bool
5475 vect_shift_permute_load_chain (vec<tree> dr_chain,
5476 unsigned int length,
5477 gimple *stmt,
5478 gimple_stmt_iterator *gsi,
5479 vec<tree> *result_chain)
5481 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5482 tree perm2_mask1, perm2_mask2, perm3_mask;
5483 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5484 gimple *perm_stmt;
5486 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5487 unsigned int i;
5488 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5489 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5490 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5491 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5493 result_chain->quick_grow (length);
5494 memcpy (result_chain->address (), dr_chain.address (),
5495 length * sizeof (tree));
5497 if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5499 unsigned int j, log_length = exact_log2 (length);
5500 for (i = 0; i < nelt / 2; ++i)
5501 sel[i] = i * 2;
5502 for (i = 0; i < nelt / 2; ++i)
5503 sel[nelt / 2 + i] = i * 2 + 1;
5504 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5506 if (dump_enabled_p ())
5507 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5508 "shuffle of 2 fields structure is not \
5509 supported by target\n");
5510 return false;
5512 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5514 for (i = 0; i < nelt / 2; ++i)
5515 sel[i] = i * 2 + 1;
5516 for (i = 0; i < nelt / 2; ++i)
5517 sel[nelt / 2 + i] = i * 2;
5518 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5520 if (dump_enabled_p ())
5521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5522 "shuffle of 2 fields structure is not \
5523 supported by target\n");
5524 return false;
5526 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5528 /* Generating permutation constant to shift all elements.
5529 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5530 for (i = 0; i < nelt; i++)
5531 sel[i] = nelt / 2 + i;
5532 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5534 if (dump_enabled_p ())
5535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5536 "shift permutation is not supported by target\n");
5537 return false;
5539 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5541 /* Generating permutation constant to select vector from 2.
5542 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5543 for (i = 0; i < nelt / 2; i++)
5544 sel[i] = i;
5545 for (i = nelt / 2; i < nelt; i++)
5546 sel[i] = nelt + i;
5547 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5549 if (dump_enabled_p ())
5550 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5551 "select is not supported by target\n");
5552 return false;
5554 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5556 for (i = 0; i < log_length; i++)
5558 for (j = 0; j < length; j += 2)
5560 first_vect = dr_chain[j];
5561 second_vect = dr_chain[j + 1];
5563 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5564 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5565 first_vect, first_vect,
5566 perm2_mask1);
5567 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5568 vect[0] = data_ref;
5570 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5571 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5572 second_vect, second_vect,
5573 perm2_mask2);
5574 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5575 vect[1] = data_ref;
5577 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5578 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5579 vect[0], vect[1], shift1_mask);
5580 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5581 (*result_chain)[j/2 + length/2] = data_ref;
5583 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5584 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5585 vect[0], vect[1], select_mask);
5586 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5587 (*result_chain)[j/2] = data_ref;
5589 memcpy (dr_chain.address (), result_chain->address (),
5590 length * sizeof (tree));
5592 return true;
5594 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5596 unsigned int k = 0, l = 0;
5598 /* Generating permutation constant to get all elements in the right order.
5599 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5600 for (i = 0; i < nelt; i++)
5602 if (3 * k + (l % 3) >= nelt)
5604 k = 0;
5605 l += (3 - (nelt % 3));
5607 sel[i] = 3 * k + (l % 3);
5608 k++;
5610 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5612 if (dump_enabled_p ())
5613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5614 "shuffle of 3 fields structure is not \
5615 supported by target\n");
5616 return false;
5618 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5620 /* Generating permutation constant to shift all elements.
5621 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5622 for (i = 0; i < nelt; i++)
5623 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5624 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5626 if (dump_enabled_p ())
5627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5628 "shift permutation is not supported by target\n");
5629 return false;
5631 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5633 /* Generating permutation constant to shift all elements.
5634 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5635 for (i = 0; i < nelt; i++)
5636 sel[i] = 2 * (nelt / 3) + 1 + i;
5637 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5639 if (dump_enabled_p ())
5640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5641 "shift permutation is not supported by target\n");
5642 return false;
5644 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5646 /* Generating permutation constant to shift all elements.
5647 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5648 for (i = 0; i < nelt; i++)
5649 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5650 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5652 if (dump_enabled_p ())
5653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5654 "shift permutation is not supported by target\n");
5655 return false;
5657 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5659 /* Generating permutation constant to shift all elements.
5660 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5661 for (i = 0; i < nelt; i++)
5662 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5663 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5665 if (dump_enabled_p ())
5666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5667 "shift permutation is not supported by target\n");
5668 return false;
5670 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5672 for (k = 0; k < 3; k++)
5674 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5675 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5676 dr_chain[k], dr_chain[k],
5677 perm3_mask);
5678 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5679 vect[k] = data_ref;
5682 for (k = 0; k < 3; k++)
5684 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5685 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5686 vect[k % 3], vect[(k + 1) % 3],
5687 shift1_mask);
5688 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5689 vect_shift[k] = data_ref;
5692 for (k = 0; k < 3; k++)
5694 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5695 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5696 vect_shift[(4 - k) % 3],
5697 vect_shift[(3 - k) % 3],
5698 shift2_mask);
5699 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5700 vect[k] = data_ref;
5703 (*result_chain)[3 - (nelt % 3)] = vect[2];
5705 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5706 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5707 vect[0], shift3_mask);
5708 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5709 (*result_chain)[nelt % 3] = data_ref;
5711 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5712 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5713 vect[1], shift4_mask);
5714 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5715 (*result_chain)[0] = data_ref;
5716 return true;
5718 return false;
5721 /* Function vect_transform_grouped_load.
5723 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5724 to perform their permutation and ascribe the resulting vectorized statements to
5725 the scalar statements.
5728 void
5729 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5730 gimple_stmt_iterator *gsi)
5732 machine_mode mode;
5733 vec<tree> result_chain = vNULL;
5735 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5736 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5737 vectors that are ready for vector computation. */
5738 result_chain.create (size);
5740 /* If the reassociation width for the vector type is 2 or greater, the target
5741 machine can execute 2 or more vector instructions in parallel. Otherwise try
5742 to get the chain for the load group using vect_shift_permute_load_chain. */
5743 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5744 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5745 || pow2p_hwi (size)
5746 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5747 gsi, &result_chain))
5748 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5749 vect_record_grouped_load_vectors (stmt, result_chain);
5750 result_chain.release ();
5753 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5754 generated as part of the vectorization of STMT. Assign the statement
5755 for each vector to the associated scalar statement. */
5757 void
5758 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5760 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5761 gimple *next_stmt, *new_stmt;
5762 unsigned int i, gap_count;
5763 tree tmp_data_ref;
5765 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5766 Since we scan the chain starting from its first node, their order
5767 corresponds to the order of data-refs in RESULT_CHAIN. */
5768 next_stmt = first_stmt;
5769 gap_count = 1;
5770 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5772 if (!next_stmt)
5773 break;
5775 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5776 code elimination pass later. No need to check for the first stmt in
5777 the group, since it always exists.
5778 GROUP_GAP is the number of steps in elements from the previous
5779 access (if there is no gap GROUP_GAP is 1). We skip loads that
5780 correspond to the gaps. */
5781 if (next_stmt != first_stmt
5782 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5784 gap_count++;
5785 continue;
5788 while (next_stmt)
5790 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5791 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5792 copies, and we put the new vector statement in the first available
5793 RELATED_STMT. */
5794 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5795 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5796 else
5798 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5800 gimple *prev_stmt =
5801 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5802 gimple *rel_stmt =
5803 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5804 while (rel_stmt)
5806 prev_stmt = rel_stmt;
5807 rel_stmt =
5808 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5811 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5812 new_stmt;
5816 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5817 gap_count = 1;
5818 /* If NEXT_STMT accesses the same DR as the previous statement,
5819 put the same TMP_DATA_REF as its vectorized statement; otherwise
5820 get the next data-ref from RESULT_CHAIN. */
5821 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5822 break;
5827 /* Function vect_can_force_dr_alignment_p.
5829 Returns whether the alignment of DECL can be forced to an ALIGNMENT-bit
5830 boundary. */
5832 bool
5833 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5835 if (!VAR_P (decl))
5836 return false;
5838 if (decl_in_symtab_p (decl)
5839 && !symtab_node::get (decl)->can_increase_alignment_p ())
5840 return false;
5842 if (TREE_STATIC (decl))
5843 return (alignment <= MAX_OFILE_ALIGNMENT);
5844 else
5845 return (alignment <= MAX_STACK_ALIGNMENT);
5849 /* Return whether the data reference DR is supported with respect to its
5850 alignment.
5851 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5852 if it is aligned, i.e., check if it is possible to vectorize it with different
5853 alignment. */
5855 enum dr_alignment_support
5856 vect_supportable_dr_alignment (struct data_reference *dr,
5857 bool check_aligned_accesses)
5859 gimple *stmt = DR_STMT (dr);
5860 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5861 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5862 machine_mode mode = TYPE_MODE (vectype);
5863 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5864 struct loop *vect_loop = NULL;
5865 bool nested_in_vect_loop = false;
5867 if (aligned_access_p (dr) && !check_aligned_accesses)
5868 return dr_aligned;
5870 /* For now assume all conditional loads/stores support unaligned
5871 access without any special code. */
5872 if (is_gimple_call (stmt)
5873 && gimple_call_internal_p (stmt)
5874 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5875 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5876 return dr_unaligned_supported;
5878 if (loop_vinfo)
5880 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5881 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5884 /* Possibly unaligned access. */
5886 /* We can choose between using the implicit realignment scheme (generating
5887 a misaligned_move stmt) and the explicit realignment scheme (generating
5888 aligned loads with a REALIGN_LOAD). There are two variants to the
5889 explicit realignment scheme: optimized, and unoptimized.
5890 We can optimize the realignment only if the step between consecutive
5891 vector loads is equal to the vector size. Since the vector memory
5892 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5893 is guaranteed that the misalignment amount remains the same throughout the
5894 execution of the vectorized loop. Therefore, we can create the
5895 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5896 at the loop preheader.
5898 However, in the case of outer-loop vectorization, when vectorizing a
5899 memory access in the inner-loop nested within the LOOP that is now being
5900 vectorized, while it is guaranteed that the misalignment of the
5901 vectorized memory access will remain the same in different outer-loop
5902 iterations, it is *not* guaranteed that it will remain the same throughout
5903 the execution of the inner-loop. This is because the inner-loop advances
5904 with the original scalar step (and not in steps of VS). If the inner-loop
5905 step happens to be a multiple of VS, then the misalignment remains fixed
5906 and we can use the optimized realignment scheme. For example:
5908 for (i=0; i<N; i++)
5909 for (j=0; j<M; j++)
5910 s += a[i+j];
5912 When vectorizing the i-loop in the above example, the step between
5913 consecutive vector loads is 1, and so the misalignment does not remain
5914 fixed across the execution of the inner-loop, and the realignment cannot
5915 be optimized (as illustrated in the following pseudo vectorized loop):
5917 for (i=0; i<N; i+=4)
5918 for (j=0; j<M; j++){
5919 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5920 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5921 // (assuming that we start from an aligned address).
5924 We therefore have to use the unoptimized realignment scheme:
5926 for (i=0; i<N; i+=4)
5927 for (j=k; j<M; j+=4)
5928 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5929 // that the misalignment of the initial address is
5930 // 0).
5932 The loop can then be vectorized as follows:
5934 for (k=0; k<4; k++){
5935 rt = get_realignment_token (&vp[k]);
5936 for (i=0; i<N; i+=4){
5937 v1 = vp[i+k];
5938 for (j=k; j<M; j+=4){
5939 v2 = vp[i+j+VS-1];
5940 va = REALIGN_LOAD <v1,v2,rt>;
5941 vs += va;
5942 v1 = v2;
5945 } */
5947 if (DR_IS_READ (dr))
5949 bool is_packed = false;
5950 tree type = (TREE_TYPE (DR_REF (dr)));
5952 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5953 && (!targetm.vectorize.builtin_mask_for_load
5954 || targetm.vectorize.builtin_mask_for_load ()))
5956 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5958 /* If we are doing SLP then the accesses need not have the
5959 same alignment; instead it depends on the SLP group size. */
5960 if (loop_vinfo
5961 && STMT_SLP_TYPE (stmt_info)
5962 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5963 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
5964 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
5966 else if (!loop_vinfo
5967 || (nested_in_vect_loop
5968 && (TREE_INT_CST_LOW (DR_STEP (dr))
5969 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
5970 return dr_explicit_realign;
5971 else
5972 return dr_explicit_realign_optimized;
5974 if (!known_alignment_for_access_p (dr))
5975 is_packed = not_size_aligned (DR_REF (dr));
5977 if (targetm.vectorize.support_vector_misalignment
5978 (mode, type, DR_MISALIGNMENT (dr), is_packed))
5979 /* Can't software pipeline the loads, but can at least do them. */
5980 return dr_unaligned_supported;
5982 else
5984 bool is_packed = false;
5985 tree type = (TREE_TYPE (DR_REF (dr)));
5987 if (!known_alignment_for_access_p (dr))
5988 is_packed = not_size_aligned (DR_REF (dr));
5990 if (targetm.vectorize.support_vector_misalignment
5991 (mode, type, DR_MISALIGNMENT (dr), is_packed))
5992 return dr_unaligned_supported;
5995 /* Unsupported. */
5996 return dr_unaligned_unsupported;
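/* Illustration (not part of this file): a tiny standalone sketch, to be
   compiled separately, of the misalignment arithmetic in the comment of
   vect_supportable_dr_alignment above.  Assuming VS == 4, an aligned base
   address and I a multiple of VS, the misalignment of &vp[i+j] in elements
   is j % VS: it cycles 0,1,2,3,... when the inner loop advances by the
   scalar step, but stays equal to K when the loop is strip-mined by VS, so
   only then can the realignment token be created once in the preheader.  */

#include <stdio.h>

#define VS 4

int
main (void)
{
  /* Scalar inner step: the misalignment varies with J.  */
  for (int j = 0; j < 8; j++)
    printf ("j = %d: misalignment = %d\n", j, j % VS);

  /* Strip-mined inner step (j starts at K and advances by VS): the
     misalignment is always K, as in the unoptimized realignment scheme.  */
  for (int k = 0; k < VS; k++)
    for (int j = k; j < 8; j += VS)
      printf ("k = %d, j = %d: misalignment = %d\n", k, j, j % VS);
  return 0;
}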