gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2015 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "tm.h"
  27 #include "hash-set.h"
  28 #include "machmode.h"
  29 #include "vec.h"
  30 #include "double-int.h"
  31 #include "input.h"
  32 #include "alias.h"
  33 #include "symtab.h"
  34 #include "wide-int.h"
  35 #include "inchash.h"
  36 #include "tree.h"
  37 #include "fold-const.h"
  38 #include "stor-layout.h"
  39 #include "tm_p.h"
  40 #include "target.h"
  41 #include "predict.h"
  42 #include "hard-reg-set.h"
  43 #include "function.h"
  44 #include "dominance.h"
  45 #include "cfg.h"
  46 #include "basic-block.h"
  47 #include "gimple-pretty-print.h"
  48 #include "tree-ssa-alias.h"
  49 #include "internal-fn.h"
  50 #include "tree-eh.h"
  51 #include "gimple-expr.h"
  52 #include "is-a.h"
  53 #include "gimple.h"
  54 #include "gimplify.h"
  55 #include "gimple-iterator.h"
  56 #include "gimplify-me.h"
  57 #include "gimple-ssa.h"
  58 #include "tree-phinodes.h"
  59 #include "ssa-iterators.h"
  60 #include "stringpool.h"
  61 #include "tree-ssanames.h"
  62 #include "tree-ssa-loop-ivopts.h"
  63 #include "tree-ssa-loop-manip.h"
  64 #include "tree-ssa-loop.h"
  65 #include "cfgloop.h"
  66 #include "tree-chrec.h"
  67 #include "tree-scalar-evolution.h"
  68 #include "tree-vectorizer.h"
  69 #include "diagnostic-core.h"
  70 #include "hash-map.h"
  71 #include "plugin-api.h"
  72 #include "ipa-ref.h"
  73 #include "cgraph.h"
  74 /* Need to include rtl.h, expr.h, etc. for optabs.  */
  75 #include "hashtab.h"
  76 #include "rtl.h"
  77 #include "flags.h"
  78 #include "statistics.h"
  79 #include "real.h"
  80 #include "fixed-value.h"
  81 #include "insn-config.h"
  82 #include "expmed.h"
  83 #include "dojump.h"
  84 #include "explow.h"
  85 #include "calls.h"
  86 #include "emit-rtl.h"
  87 #include "varasm.h"
  88 #include "stmt.h"
  89 #include "expr.h"
  90 #include "insn-codes.h"
  91 #include "optabs.h"
  92 #include "builtins.h"
  93
  94 /* Return true if load- or store-lanes optab OPTAB is implemented for
  95    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  96
  97 static bool
  98 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  99                               tree vectype, unsigned HOST_WIDE_INT count)
 100 {
 101   machine_mode mode, array_mode;
 102   bool limit_p;
 103
 104   mode = TYPE_MODE (vectype);
 105   limit_p = !targetm.array_mode_supported_p (mode, count);
 106   array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
 107                               MODE_INT, limit_p);
 108
 109   if (array_mode == BLKmode)
 110     {
 111       if (dump_enabled_p ())
 112         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 113                          "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
 114                          GET_MODE_NAME (mode), count);
 115       return false;
 116     }
 117
 118   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
 119     {
 120       if (dump_enabled_p ())
 121         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 122                          "cannot use %s<%s><%s>\n", name,
 123                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
 124       return false;
 125     }
 126
 127   if (dump_enabled_p ())
 128     dump_printf_loc (MSG_NOTE, vect_location,
 129                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
 130                      GET_MODE_NAME (mode));
 131
 132   return true;
 133 }
 134
 135
 136 /* Return the smallest scalar part of STMT.
 137    This is used to determine the vectype of the stmt.  We generally set the
 138    vectype according to the type of the result (lhs).  For stmts whose
 139    result-type is different than the type of the arguments (e.g., demotion,
 140    promotion), vectype will be reset appropriately (later).  Note that we have
 141    to visit the smallest datatype in this function, because that determines the
 142    VF.  If the smallest datatype in the loop is present only as the rhs of a
 143    promotion operation - we'd miss it.
 144    Such a case, where a variable of this datatype does not appear in the lhs
 145    anywhere in the loop, can only occur if it's an invariant: e.g.:
 146    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 147    invariant motion.  However, we cannot rely on invariant motion to always
 148    take invariants out of the loop, and so in the case of promotion we also
 149    have to check the rhs.
 150    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 151    types.  */
 152
 153 tree
 154 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
 155                                HOST_WIDE_INT *rhs_size_unit)
 156 {
 157   tree scalar_type = gimple_expr_type (stmt);
 158   HOST_WIDE_INT lhs, rhs;
 159
 160   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 161
 162   if (is_gimple_assign (stmt)
 163       && (gimple_assign_cast_p (stmt)
 164           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
 165           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
 166           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
 167     {
 168       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 169
 170       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 171       if (rhs < lhs)
 172         scalar_type = rhs_type;
 173     }
 174
 175   *lhs_size_unit = lhs;
 176   *rhs_size_unit = rhs;
 177   return scalar_type;
 178 }
 179
 180
 181 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 182    tested at run-time.  Return TRUE if DDR was successfully inserted.
 183    Return false if versioning is not supported.  */
 184
 185 static bool
 186 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 187 {
 188   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 189
 190   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 191     return false;
 192
 193   if (dump_enabled_p ())
 194     {
 195       dump_printf_loc (MSG_NOTE, vect_location,
 196                        "mark for run-time aliasing test between ");
 197       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
 198       dump_printf (MSG_NOTE,  " and ");
 199       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
 200       dump_printf (MSG_NOTE, "\n");
 201     }
 202
 203   if (optimize_loop_nest_for_size_p (loop))
 204     {
 205       if (dump_enabled_p ())
 206         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 207                          "versioning not supported when optimizing"
 208                          " for size.\n");
 209       return false;
 210     }
 211
 212   /* FORNOW: We don't support versioning with outer-loop vectorization.  */
 213   if (loop->inner)
 214     {
 215       if (dump_enabled_p ())
 216         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 217                          "versioning not yet supported for outer-loops.\n");
 218       return false;
 219     }
 220
 221   /* FORNOW: We don't support creating runtime alias tests for non-constant
 222      step.  */
 223   if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
 224       || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
 225     {
 226       if (dump_enabled_p ())
 227         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 228                          "versioning not yet supported for non-constant "
 229                          "step\n");
 230       return false;
 231     }
 232
 233   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 234   return true;
 235 }
 236
 237
 238 /* Function vect_analyze_data_ref_dependence.
 239
 240    Return TRUE if there (might) exist a dependence between a memory-reference
 241    DRA and a memory-reference DRB.  When versioning for alias may check a
 242    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 243    the data dependence.  */
 244
 245 static bool
 246 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 247                                   loop_vec_info loop_vinfo, int *max_vf)
 248 {
 249   unsigned int i;
 250   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 251   struct data_reference *dra = DDR_A (ddr);
 252   struct data_reference *drb = DDR_B (ddr);
 253   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
 254   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
 255   lambda_vector dist_v;
 256   unsigned int loop_depth;
 257
 258   /* In loop analysis all data references should be vectorizable.  */
 259   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 260       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 261     gcc_unreachable ();
 262
 263   /* Independent data accesses.  */
 264   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 265     return false;
 266
 267   if (dra == drb
 268       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 269     return false;
 270
 271   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 272      least two scalar iterations, there is always also a true dependence.
 273      As the vectorizer does not re-order loads and stores we can ignore
 274      the anti-dependence if TBAA can disambiguate both DRs similar to the
 275      case with known negative distance anti-dependences (positive
 276      distance anti-dependences would violate TBAA constraints).  */
 277   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 278        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 279       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 280                                  get_alias_set (DR_REF (drb))))
 281     return false;
 282
 283   /* Unknown data dependence.  */
 284   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 285     {
 286       /* If user asserted safelen consecutive iterations can be
 287          executed concurrently, assume independence.  */
 288       if (loop->safelen >= 2)
 289         {
 290           if (loop->safelen < *max_vf)
 291             *max_vf = loop->safelen;
 292           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 293           return false;
 294         }
 295
 296       if (STMT_VINFO_GATHER_P (stmtinfo_a)
 297           || STMT_VINFO_GATHER_P (stmtinfo_b))
 298         {
 299           if (dump_enabled_p ())
 300             {
 301               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 302                                "versioning for alias not supported for: "
 303                                "can't determine dependence between ");
 304               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 305                                  DR_REF (dra));
 306               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 307               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 308                                  DR_REF (drb));
 309               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 310             }
 311           return true;
 312         }
 313
 314       if (dump_enabled_p ())
 315         {
 316           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 317                            "versioning for alias required: "
 318                            "can't determine dependence between ");
 319           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 320                              DR_REF (dra));
 321           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 322           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 323                              DR_REF (drb));
 324           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 325         }
 326
 327       /* Add to list of ddrs that need to be tested at run-time.  */
 328       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 329     }
 330
 331   /* Known data dependence.  */
 332   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 333     {
 334       /* If user asserted safelen consecutive iterations can be
 335          executed concurrently, assume independence.  */
 336       if (loop->safelen >= 2)
 337         {
 338           if (loop->safelen < *max_vf)
 339             *max_vf = loop->safelen;
 340           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 341           return false;
 342         }
 343
 344       if (STMT_VINFO_GATHER_P (stmtinfo_a)
 345           || STMT_VINFO_GATHER_P (stmtinfo_b))
 346         {
 347           if (dump_enabled_p ())
 348             {
 349               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 350                                "versioning for alias not supported for: "
 351                                "bad dist vector for ");
 352               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 353                                  DR_REF (dra));
 354               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 355               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 356                                  DR_REF (drb));
 357               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 358             }
 359           return true;
 360         }
 361
 362       if (dump_enabled_p ())
 363         {
 364           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 365                            "versioning for alias required: "
 366                            "bad dist vector for ");
 367           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 368           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 369           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 370           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 371         }
 372       /* Add to list of ddrs that need to be tested at run-time.  */
 373       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 374     }
 375
 376   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 377   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 378     {
 379       int dist = dist_v[loop_depth];
 380
 381       if (dump_enabled_p ())
 382         dump_printf_loc (MSG_NOTE, vect_location,
 383                          "dependence distance  = %d.\n", dist);
 384
 385       if (dist == 0)
 386         {
 387           if (dump_enabled_p ())
 388             {
 389               dump_printf_loc (MSG_NOTE, vect_location,
 390                                "dependence distance == 0 between ");
 391               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 392               dump_printf (MSG_NOTE, " and ");
 393               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 394               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 395             }
 396
 397           /* When we perform grouped accesses and perform implicit CSE
 398              by detecting equal accesses and doing disambiguation with
 399              runtime alias tests like for
 400                 .. = a[i];
 401                 .. = a[i+1];
 402                 a[i] = ..;
 403                 a[i+1] = ..;
 404                 *p = ..;
 405                 .. = a[i];
 406                 .. = a[i+1];
 407              where we will end up loading { a[i], a[i+1] } once, make
 408              sure that inserting group loads before the first load and
 409              stores after the last store will do the right thing.
 410              Similar for groups like
 411                 a[i] = ...;
 412                 ... = a[i];
 413                 a[i+1] = ...;
 414              where loads from the group interleave with the store.  */
 415           if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 416               || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 417             {
 418               gimple earlier_stmt;
 419               earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
 420               if (DR_IS_WRITE
 421                     (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
 422                 {
 423                   if (dump_enabled_p ())
 424                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 425                                      "READ_WRITE dependence in interleaving."
 426                                      "\n");
 427                   return true;
 428                 }
 429             }
 430
 431           continue;
 432         }
 433
 434       if (dist > 0 && DDR_REVERSED_P (ddr))
 435         {
 436           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 437              reversed (to make distance vector positive), and the actual
 438              distance is negative.  */
 439           if (dump_enabled_p ())
 440             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 441                              "dependence distance negative.\n");
 442           /* Record a negative dependence distance to later limit the
 443              amount of stmt copying / unrolling we can perform.
 444              Only need to handle read-after-write dependence.  */
 445           if (DR_IS_READ (drb)
 446               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 447                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 448             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 449           continue;
 450         }
 451
 452       if (abs (dist) >= 2
 453           && abs (dist) < *max_vf)
 454         {
 455           /* The dependence distance requires reduction of the maximal
 456              vectorization factor.  */
 457           *max_vf = abs (dist);
 458           if (dump_enabled_p ())
 459             dump_printf_loc (MSG_NOTE, vect_location,
 460                              "adjusting maximal vectorization factor to %i\n",
 461                              *max_vf);
 462         }
 463
 464       if (abs (dist) >= *max_vf)
 465         {
 466           /* Dependence distance does not create dependence, as far as
 467              vectorization is concerned, in this case.  */
 468           if (dump_enabled_p ())
 469             dump_printf_loc (MSG_NOTE, vect_location,
 470                              "dependence distance >= VF.\n");
 471           continue;
 472         }
 473
 474       if (dump_enabled_p ())
 475         {
 476           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 477                        "not vectorized, possible dependence "
 478                        "between data-refs ");
 479           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 480           dump_printf (MSG_NOTE,  " and ");
 481           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 482           dump_printf (MSG_NOTE,  "\n");
 483         }
 484
 485       return true;
 486     }
 487
 488   return false;
 489 }
 490
 491 /* Function vect_analyze_data_ref_dependences.
 492
 493    Examine all the data references in the loop, and make sure there do not
 494    exist any data dependences between them.  Set *MAX_VF according to
 495    the maximum vectorization factor the data dependences allow.  */
 496
 497 bool
 498 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
 499 {
 500   unsigned int i;
 501   struct data_dependence_relation *ddr;
 502
 503   if (dump_enabled_p ())
 504     dump_printf_loc (MSG_NOTE, vect_location,
 505                      "=== vect_analyze_data_ref_dependences ===\n");
 506
 507   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 508   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 509                                 &LOOP_VINFO_DDRS (loop_vinfo),
 510                                 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
 511     return false;
 512
 513   FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 514     if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
 515       return false;
 516
 517   return true;
 518 }
 519
 520
 521 /* Function vect_slp_analyze_data_ref_dependence.
 522
 523    Return TRUE if there (might) exist a dependence between a memory-reference
 524    DRA and a memory-reference DRB.  When versioning for alias may check a
 525    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 526    the data dependence.  */
 527
 528 static bool
 529 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
 530 {
 531   struct data_reference *dra = DDR_A (ddr);
 532   struct data_reference *drb = DDR_B (ddr);
 533
 534   /* We need to check dependences of statements marked as unvectorizable
 535      as well, they still can prohibit vectorization.  */
 536
 537   /* Independent data accesses.  */
 538   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 539     return false;
 540
 541   if (dra == drb)
 542     return false;
 543
 544   /* Read-read is OK.  */
 545   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 546     return false;
 547
 548   /* If dra and drb are part of the same interleaving chain consider
 549      them independent.  */
 550   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
 551       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
 552           == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
 553     return false;
 554
 555   /* Unknown data dependence.  */
 556   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 557     {
 558       if  (dump_enabled_p ())
 559         {
 560           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 561                            "can't determine dependence between ");
 562           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 563           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 564           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 565           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 566         }
 567     }
 568   else if (dump_enabled_p ())
 569     {
 570       dump_printf_loc (MSG_NOTE, vect_location,
 571                        "determined dependence between ");
 572       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 573       dump_printf (MSG_NOTE, " and ");
 574       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 575       dump_printf (MSG_NOTE,  "\n");
 576     }
 577
 578   /* We do not vectorize basic blocks with write-write dependencies.  */
 579   if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
 580     return true;
 581
 582   /* If we have a read-write dependence check that the load is before the store.
 583      When we vectorize basic blocks, vector load can be only before
 584      corresponding scalar load, and vector store can be only after its
 585      corresponding scalar store.  So the order of the acceses is preserved in
 586      case the load is before the store.  */
 587   gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
 588   if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
 589     {
 590       /* That only holds for load-store pairs taking part in vectorization.  */
 591       if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
 592           && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
 593         return false;
 594     }
 595
 596   return true;
 597 }
 598
 599
 600 /* Function vect_analyze_data_ref_dependences.
 601
 602    Examine all the data references in the basic-block, and make sure there
 603    do not exist any data dependences between them.  Set *MAX_VF according to
 604    the maximum vectorization factor the data dependences allow.  */
 605
 606 bool
 607 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
 608 {
 609   struct data_dependence_relation *ddr;
 610   unsigned int i;
 611
 612   if (dump_enabled_p ())
 613     dump_printf_loc (MSG_NOTE, vect_location,
 614                      "=== vect_slp_analyze_data_ref_dependences ===\n");
 615
 616   if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
 617                                 &BB_VINFO_DDRS (bb_vinfo),
 618                                 vNULL, true))
 619     return false;
 620
 621   FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
 622     if (vect_slp_analyze_data_ref_dependence (ddr))
 623       return false;
 624
 625   return true;
 626 }
 627
 628
 629 /* Function vect_compute_data_ref_alignment
 630
 631    Compute the misalignment of the data reference DR.
 632
 633    Output:
 634    1. If during the misalignment computation it is found that the data reference
 635       cannot be vectorized then false is returned.
 636    2. DR_MISALIGNMENT (DR) is defined.
 637
 638    FOR NOW: No analysis is actually performed. Misalignment is calculated
 639    only for trivial cases. TODO.  */
 640
 641 static bool
 642 vect_compute_data_ref_alignment (struct data_reference *dr)
 643 {
 644   gimple stmt = DR_STMT (dr);
 645   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 646   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 647   struct loop *loop = NULL;
 648   tree ref = DR_REF (dr);
 649   tree vectype;
 650   tree base, base_addr;
 651   bool base_aligned;
 652   tree misalign = NULL_TREE;
 653   tree aligned_to;
 654   unsigned HOST_WIDE_INT alignment;
 655
 656   if (dump_enabled_p ())
 657     dump_printf_loc (MSG_NOTE, vect_location,
 658                      "vect_compute_data_ref_alignment:\n");
 659
 660   if (loop_vinfo)
 661     loop = LOOP_VINFO_LOOP (loop_vinfo);
 662
 663   /* Initialize misalignment to unknown.  */
 664   SET_DR_MISALIGNMENT (dr, -1);
 665
 666   /* Strided accesses perform only component accesses, misalignment information
 667      is irrelevant for them.  */
 668   if (STMT_VINFO_STRIDED_P (stmt_info)
 669       && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
 670     return true;
 671
 672   if (tree_fits_shwi_p (DR_STEP (dr)))
 673     misalign = DR_INIT (dr);
 674   aligned_to = DR_ALIGNED_TO (dr);
 675   base_addr = DR_BASE_ADDRESS (dr);
 676   vectype = STMT_VINFO_VECTYPE (stmt_info);
 677
 678   /* In case the dataref is in an inner-loop of the loop that is being
 679      vectorized (LOOP), we use the base and misalignment information
 680      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 681      stays the same throughout the execution of the inner-loop, which is why
 682      we have to check that the stride of the dataref in the inner-loop evenly
 683      divides by the vector size.  */
 684   if (loop && nested_in_vect_loop_p (loop, stmt))
 685     {
 686       tree step = DR_STEP (dr);
 687
 688       if (tree_fits_shwi_p (step)
 689           && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
 690         {
 691           if (dump_enabled_p ())
 692             dump_printf_loc (MSG_NOTE, vect_location,
 693                              "inner step divides the vector-size.\n");
 694           misalign = STMT_VINFO_DR_INIT (stmt_info);
 695           aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
 696           base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
 697         }
 698       else
 699         {
 700           if (dump_enabled_p ())
 701             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 702                              "inner step doesn't divide the vector-size.\n");
 703           misalign = NULL_TREE;
 704         }
 705     }
 706
 707   /* Similarly, if we're doing basic-block vectorization, we can only use
 708      base and misalignment information relative to an innermost loop if the
 709      misalignment stays the same throughout the execution of the loop.
 710      As above, this is the case if the stride of the dataref evenly divides
 711      by the vector size.  */
 712   if (!loop)
 713     {
 714       tree step = DR_STEP (dr);
 715
 716       if (tree_fits_shwi_p (step)
 717           && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
 718         {
 719           if (dump_enabled_p ())
 720             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 721                              "SLP: step doesn't divide the vector-size.\n");
 722           misalign = NULL_TREE;
 723         }
 724     }
 725
 726   alignment = TYPE_ALIGN_UNIT (vectype);
 727
 728   if ((compare_tree_int (aligned_to, alignment) < 0)
 729       || !misalign)
 730     {
 731       if (dump_enabled_p ())
 732         {
 733           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 734                            "Unknown alignment for access: ");
 735           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 736           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 737         }
 738       return true;
 739     }
 740
 741   /* To look at alignment of the base we have to preserve an inner MEM_REF
 742      as that carries alignment information of the actual access.  */
 743   base = ref;
 744   while (handled_component_p (base))
 745     base = TREE_OPERAND (base, 0);
 746   if (TREE_CODE (base) == MEM_REF)
 747     base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
 748                    build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
 749
 750   if (get_object_alignment (base) >= TYPE_ALIGN (vectype))
 751     base_aligned = true;
 752   else
 753     base_aligned = false;
 754
 755   if (!base_aligned)
 756     {
 757       /* Strip an inner MEM_REF to a bare decl if possible.  */
 758       if (TREE_CODE (base) == MEM_REF
 759           && integer_zerop (TREE_OPERAND (base, 1))
 760           && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
 761         base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
 762
 763       if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
 764         {
 765           if (dump_enabled_p ())
 766             {
 767               dump_printf_loc (MSG_NOTE, vect_location,
 768                                "can't force alignment of ref: ");
 769               dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 770               dump_printf (MSG_NOTE, "\n");
 771             }
 772           return true;
 773         }
 774
 775       /* Force the alignment of the decl.
 776          NOTE: This is the only change to the code we make during
 777          the analysis phase, before deciding to vectorize the loop.  */
 778       if (dump_enabled_p ())
 779         {
 780           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
 781           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 782           dump_printf (MSG_NOTE, "\n");
 783         }
 784
 785       ((dataref_aux *)dr->aux)->base_decl = base;
 786       ((dataref_aux *)dr->aux)->base_misaligned = true;
 787     }
 788
 789   /* If this is a backward running DR then first access in the larger
 790      vectype actually is N-1 elements before the address in the DR.
 791      Adjust misalign accordingly.  */
 792   if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
 793     {
 794       tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
 795       /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
 796          otherwise we wouldn't be here.  */
 797       offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
 798       /* PLUS because DR_STEP was negative.  */
 799       misalign = size_binop (PLUS_EXPR, misalign, offset);
 800     }
 801
 802   SET_DR_MISALIGNMENT (dr,
 803                        wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
 804
 805   if (dump_enabled_p ())
 806     {
 807       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 808                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
 809       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 810       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 811     }
 812
 813   return true;
 814 }
 815
 816
 817 /* Function vect_compute_data_refs_alignment
 818
 819    Compute the misalignment of data references in the loop.
 820    Return FALSE if a data reference is found that cannot be vectorized.  */
 821
 822 static bool
 823 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
 824                                   bb_vec_info bb_vinfo)
 825 {
 826   vec<data_reference_p> datarefs;
 827   struct data_reference *dr;
 828   unsigned int i;
 829
 830   if (loop_vinfo)
 831     datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
 832   else
 833     datarefs = BB_VINFO_DATAREFS (bb_vinfo);
 834
 835   FOR_EACH_VEC_ELT (datarefs, i, dr)
 836     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
 837         && !vect_compute_data_ref_alignment (dr))
 838       {
 839         if (bb_vinfo)
 840           {
 841             /* Mark unsupported statement as unvectorizable.  */
 842             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
 843             continue;
 844           }
 845         else
 846           return false;
 847       }
 848
 849   return true;
 850 }
 851
 852
 853 /* Function vect_update_misalignment_for_peel
 854
 855    DR - the data reference whose misalignment is to be adjusted.
 856    DR_PEEL - the data reference whose misalignment is being made
 857              zero in the vector loop by the peel.
 858    NPEEL - the number of iterations in the peel loop if the misalignment
 859            of DR_PEEL is known at compile time.  */
 860
 861 static void
 862 vect_update_misalignment_for_peel (struct data_reference *dr,
 863                                    struct data_reference *dr_peel, int npeel)
 864 {
 865   unsigned int i;
 866   vec<dr_p> same_align_drs;
 867   struct data_reference *current_dr;
 868   int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
 869   int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
 870   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
 871   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
 872
 873  /* For interleaved data accesses the step in the loop must be multiplied by
 874      the size of the interleaving group.  */
 875   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 876     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
 877   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
 878     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
 879
 880   /* It can be assumed that the data refs with the same alignment as dr_peel
 881      are aligned in the vector loop.  */
 882   same_align_drs
 883     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
 884   FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
 885     {
 886       if (current_dr != dr)
 887         continue;
 888       gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
 889                   DR_MISALIGNMENT (dr_peel) / dr_peel_size);
 890       SET_DR_MISALIGNMENT (dr, 0);
 891       return;
 892     }
 893
 894   if (known_alignment_for_access_p (dr)
 895       && known_alignment_for_access_p (dr_peel))
 896     {
 897       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
 898       int misal = DR_MISALIGNMENT (dr);
 899       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 900       misal += negative ? -npeel * dr_size : npeel * dr_size;
 901       misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
 902       SET_DR_MISALIGNMENT (dr, misal);
 903       return;
 904     }
 905
 906   if (dump_enabled_p ())
 907     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
 908   SET_DR_MISALIGNMENT (dr, -1);
 909 }
 910
 911
 912 /* Function vect_verify_datarefs_alignment
 913
 914    Return TRUE if all data references in the loop can be
 915    handled with respect to alignment.  */
 916
 917 bool
 918 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
 919 {
 920   vec<data_reference_p> datarefs;
 921   struct data_reference *dr;
 922   enum dr_alignment_support supportable_dr_alignment;
 923   unsigned int i;
 924
 925   if (loop_vinfo)
 926     datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
 927   else
 928     datarefs = BB_VINFO_DATAREFS (bb_vinfo);
 929
 930   FOR_EACH_VEC_ELT (datarefs, i, dr)
 931     {
 932       gimple stmt = DR_STMT (dr);
 933       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 934
 935       if (!STMT_VINFO_RELEVANT_P (stmt_info))
 936         continue;
 937
 938       /* For interleaving, only the alignment of the first access matters.
 939          Skip statements marked as not vectorizable.  */
 940       if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
 941            && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
 942           || !STMT_VINFO_VECTORIZABLE (stmt_info))
 943         continue;
 944
 945       /* Strided accesses perform only component accesses, alignment is
 946          irrelevant for them.  */
 947       if (STMT_VINFO_STRIDED_P (stmt_info)
 948           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
 949         continue;
 950
 951       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
 952       if (!supportable_dr_alignment)
 953         {
 954           if (dump_enabled_p ())
 955             {
 956               if (DR_IS_READ (dr))
 957                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 958                                  "not vectorized: unsupported unaligned load.");
 959               else
 960                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 961                                  "not vectorized: unsupported unaligned "
 962                                  "store.");
 963
 964               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 965                                  DR_REF (dr));
 966               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 967             }
 968           return false;
 969         }
 970       if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
 971         dump_printf_loc (MSG_NOTE, vect_location,
 972                          "Vectorizing an unaligned access.\n");
 973     }
 974   return true;
 975 }
 976
 977 /* Given an memory reference EXP return whether its alignment is less
 978    than its size.  */
 979
 980 static bool
 981 not_size_aligned (tree exp)
 982 {
 983   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
 984     return true;
 985
 986   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
 987           > get_object_alignment (exp));
 988 }
 989
 990 /* Function vector_alignment_reachable_p
 991
 992    Return true if vector alignment for DR is reachable by peeling
 993    a few loop iterations.  Return false otherwise.  */
 994
 995 static bool
 996 vector_alignment_reachable_p (struct data_reference *dr)
 997 {
 998   gimple stmt = DR_STMT (dr);
 999   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1000   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1001
1002   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1003     {
1004       /* For interleaved access we peel only if number of iterations in
1005          the prolog loop ({VF - misalignment}), is a multiple of the
1006          number of the interleaved accesses.  */
1007       int elem_size, mis_in_elements;
1008       int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1009
1010       /* FORNOW: handle only known alignment.  */
1011       if (!known_alignment_for_access_p (dr))
1012         return false;
1013
1014       elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1015       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1016
1017       if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1018         return false;
1019     }
1020
1021   /* If misalignment is known at the compile time then allow peeling
1022      only if natural alignment is reachable through peeling.  */
1023   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1024     {
1025       HOST_WIDE_INT elmsize =
1026                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1027       if (dump_enabled_p ())
1028         {
1029           dump_printf_loc (MSG_NOTE, vect_location,
1030                            "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1031           dump_printf (MSG_NOTE,
1032                        ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1033         }
1034       if (DR_MISALIGNMENT (dr) % elmsize)
1035         {
1036           if (dump_enabled_p ())
1037             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1038                              "data size does not divide the misalignment.\n");
1039           return false;
1040         }
1041     }
1042
1043   if (!known_alignment_for_access_p (dr))
1044     {
1045       tree type = TREE_TYPE (DR_REF (dr));
1046       bool is_packed = not_size_aligned (DR_REF (dr));
1047       if (dump_enabled_p ())
1048         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1049                          "Unknown misalignment, is_packed = %d\n",is_packed);
1050       if ((TYPE_USER_ALIGN (type) && !is_packed)
1051           || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1052         return true;
1053       else
1054         return false;
1055     }
1056
1057   return true;
1058 }
1059
1060
1061 /* Calculate the cost of the memory access represented by DR.  */
1062
1063 static void
1064 vect_get_data_access_cost (struct data_reference *dr,
1065                            unsigned int *inside_cost,
1066                            unsigned int *outside_cost,
1067                            stmt_vector_for_cost *body_cost_vec)
1068 {
1069   gimple stmt = DR_STMT (dr);
1070   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1071   int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1072   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1073   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1074   int ncopies = vf / nunits;
1075
1076   if (DR_IS_READ (dr))
1077     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1078                         NULL, body_cost_vec, false);
1079   else
1080     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1081
1082   if (dump_enabled_p ())
1083     dump_printf_loc (MSG_NOTE, vect_location,
1084                      "vect_get_data_access_cost: inside_cost = %d, "
1085                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1086 }
1087
1088
1089 /* Insert DR into peeling hash table with NPEEL as key.  */
1090
1091 static void
1092 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1093                           int npeel)
1094 {
1095   struct _vect_peel_info elem, *slot;
1096   _vect_peel_info **new_slot;
1097   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1098
1099   elem.npeel = npeel;
1100   slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem);
1101   if (slot)
1102     slot->count++;
1103   else
1104     {
1105       slot = XNEW (struct _vect_peel_info);
1106       slot->npeel = npeel;
1107       slot->dr = dr;
1108       slot->count = 1;
1109       new_slot
1110         = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT);
1111       *new_slot = slot;
1112     }
1113
1114   if (!supportable_dr_alignment
1115       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1116     slot->count += VECT_MAX_COST;
1117 }
1118
1119
1120 /* Traverse peeling hash table to find peeling option that aligns maximum
1121    number of data accesses.  */
1122
1123 int
1124 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1125                                      _vect_peel_extended_info *max)
1126 {
1127   vect_peel_info elem = *slot;
1128
1129   if (elem->count > max->peel_info.count
1130       || (elem->count == max->peel_info.count
1131           && max->peel_info.npeel > elem->npeel))
1132     {
1133       max->peel_info.npeel = elem->npeel;
1134       max->peel_info.count = elem->count;
1135       max->peel_info.dr = elem->dr;
1136     }
1137
1138   return 1;
1139 }
1140
1141
1142 /* Traverse peeling hash table and calculate cost for each peeling option.
1143    Find the one with the lowest cost.  */
1144
1145 int
1146 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1147                                    _vect_peel_extended_info *min)
1148 {
1149   vect_peel_info elem = *slot;
1150   int save_misalignment, dummy;
1151   unsigned int inside_cost = 0, outside_cost = 0, i;
1152   gimple stmt = DR_STMT (elem->dr);
1153   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1154   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1155   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1156   struct data_reference *dr;
1157   stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1158
1159   prologue_cost_vec.create (2);
1160   body_cost_vec.create (2);
1161   epilogue_cost_vec.create (2);
1162
1163   FOR_EACH_VEC_ELT (datarefs, i, dr)
1164     {
1165       stmt = DR_STMT (dr);
1166       stmt_info = vinfo_for_stmt (stmt);
1167       /* For interleaving, only the alignment of the first access
1168          matters.  */
1169       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1170           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1171         continue;
1172
1173       save_misalignment = DR_MISALIGNMENT (dr);
1174       vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1175       vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1176                                  &body_cost_vec);
1177       SET_DR_MISALIGNMENT (dr, save_misalignment);
1178     }
1179
1180   auto_vec<stmt_info_for_cost> scalar_cost_vec;
1181   vect_get_single_scalar_iteration_cost (loop_vinfo, &scalar_cost_vec);
1182   outside_cost += vect_get_known_peeling_cost
1183     (loop_vinfo, elem->npeel, &dummy,
1184      &scalar_cost_vec, &prologue_cost_vec, &epilogue_cost_vec);
1185
1186   /* Prologue and epilogue costs are added to the target model later.
1187      These costs depend only on the scalar iteration cost, the
1188      number of peeling iterations finally chosen, and the number of
1189      misaligned statements.  So discard the information found here.  */
1190   prologue_cost_vec.release ();
1191   epilogue_cost_vec.release ();
1192
1193   if (inside_cost < min->inside_cost
1194       || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1195     {
1196       min->inside_cost = inside_cost;
1197       min->outside_cost = outside_cost;
1198       min->body_cost_vec.release ();
1199       min->body_cost_vec = body_cost_vec;
1200       min->peel_info.dr = elem->dr;
1201       min->peel_info.npeel = elem->npeel;
1202     }
1203   else
1204     body_cost_vec.release ();
1205
1206   return 1;
1207 }
1208
1209
1210 /* Choose best peeling option by traversing peeling hash table and either
1211    choosing an option with the lowest cost (if cost model is enabled) or the
1212    option that aligns as many accesses as possible.  */
1213
1214 static struct data_reference *
1215 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1216                                        unsigned int *npeel,
1217                                        stmt_vector_for_cost *body_cost_vec)
1218 {
1219    struct _vect_peel_extended_info res;
1220
1221    res.peel_info.dr = NULL;
1222    res.body_cost_vec = stmt_vector_for_cost ();
1223
1224    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1225      {
1226        res.inside_cost = INT_MAX;
1227        res.outside_cost = INT_MAX;
1228        LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1229            ->traverse <_vect_peel_extended_info *,
1230                        vect_peeling_hash_get_lowest_cost> (&res);
1231      }
1232    else
1233      {
1234        res.peel_info.count = 0;
1235        LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1236            ->traverse <_vect_peel_extended_info *,
1237                        vect_peeling_hash_get_most_frequent> (&res);
1238      }
1239
1240    *npeel = res.peel_info.npeel;
1241    *body_cost_vec = res.body_cost_vec;
1242    return res.peel_info.dr;
1243 }
1244
1245
1246 /* Function vect_enhance_data_refs_alignment
1247
1248    This pass will use loop versioning and loop peeling in order to enhance
1249    the alignment of data references in the loop.
1250
1251    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1252    original loop is to be vectorized.  Any other loops that are created by
1253    the transformations performed in this pass - are not supposed to be
1254    vectorized.  This restriction will be relaxed.
1255
1256    This pass will require a cost model to guide it whether to apply peeling
1257    or versioning or a combination of the two.  For example, the scheme that
1258    intel uses when given a loop with several memory accesses, is as follows:
1259    choose one memory access ('p') which alignment you want to force by doing
1260    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1261    other accesses are not necessarily aligned, or (2) use loop versioning to
1262    generate one loop in which all accesses are aligned, and another loop in
1263    which only 'p' is necessarily aligned.
1264
1265    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1266    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1267    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1268
1269    Devising a cost model is the most critical aspect of this work.  It will
1270    guide us on which access to peel for, whether to use loop versioning, how
1271    many versions to create, etc.  The cost model will probably consist of
1272    generic considerations as well as target specific considerations (on
1273    powerpc for example, misaligned stores are more painful than misaligned
1274    loads).
1275
1276    Here are the general steps involved in alignment enhancements:
1277
1278      -- original loop, before alignment analysis:
1279         for (i=0; i<N; i++){
1280           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1281           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1282         }
1283
1284      -- After vect_compute_data_refs_alignment:
1285         for (i=0; i<N; i++){
1286           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1287           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1288         }
1289
1290      -- Possibility 1: we do loop versioning:
1291      if (p is aligned) {
1292         for (i=0; i<N; i++){    # loop 1A
1293           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1294           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1295         }
1296      }
1297      else {
1298         for (i=0; i<N; i++){    # loop 1B
1299           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1300           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1301         }
1302      }
1303
1304      -- Possibility 2: we do loop peeling:
1305      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1306         x = q[i];
1307         p[i] = y;
1308      }
1309      for (i = 3; i < N; i++){   # loop 2A
1310         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1311         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1312      }
1313
1314      -- Possibility 3: combination of loop peeling and versioning:
1315      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1316         x = q[i];
1317         p[i] = y;
1318      }
1319      if (p is aligned) {
1320         for (i = 3; i<N; i++){  # loop 3A
1321           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1322           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1323         }
1324      }
1325      else {
1326         for (i = 3; i<N; i++){  # loop 3B
1327           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1328           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1329         }
1330      }
1331
1332      These loops are later passed to loop_transform to be vectorized.  The
1333      vectorizer will use the alignment information to guide the transformation
1334      (whether to generate regular loads/stores, or with special handling for
1335      misalignment).  */
1336
1337 bool
1338 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1339 {
1340   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1341   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1342   enum dr_alignment_support supportable_dr_alignment;
1343   struct data_reference *dr0 = NULL, *first_store = NULL;
1344   struct data_reference *dr;
1345   unsigned int i, j;
1346   bool do_peeling = false;
1347   bool do_versioning = false;
1348   bool stat;
1349   gimple stmt;
1350   stmt_vec_info stmt_info;
1351   unsigned int npeel = 0;
1352   bool all_misalignments_unknown = true;
1353   unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1354   unsigned possible_npeel_number = 1;
1355   tree vectype;
1356   unsigned int nelements, mis, same_align_drs_max = 0;
1357   stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1358
1359   if (dump_enabled_p ())
1360     dump_printf_loc (MSG_NOTE, vect_location,
1361                      "=== vect_enhance_data_refs_alignment ===\n");
1362
1363   /* While cost model enhancements are expected in the future, the high level
1364      view of the code at this time is as follows:
1365
1366      A) If there is a misaligned access then see if peeling to align
1367         this access can make all data references satisfy
1368         vect_supportable_dr_alignment.  If so, update data structures
1369         as needed and return true.
1370
1371      B) If peeling wasn't possible and there is a data reference with an
1372         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1373         then see if loop versioning checks can be used to make all data
1374         references satisfy vect_supportable_dr_alignment.  If so, update
1375         data structures as needed and return true.
1376
1377      C) If neither peeling nor versioning were successful then return false if
1378         any data reference does not satisfy vect_supportable_dr_alignment.
1379
1380      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1381
1382      Note, Possibility 3 above (which is peeling and versioning together) is not
1383      being done at this time.  */
1384
1385   /* (1) Peeling to force alignment.  */
1386
1387   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1388      Considerations:
1389      + How many accesses will become aligned due to the peeling
1390      - How many accesses will become unaligned due to the peeling,
1391        and the cost of misaligned accesses.
1392      - The cost of peeling (the extra runtime checks, the increase
1393        in code size).  */
1394
1395   FOR_EACH_VEC_ELT (datarefs, i, dr)
1396     {
1397       stmt = DR_STMT (dr);
1398       stmt_info = vinfo_for_stmt (stmt);
1399
1400       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1401         continue;
1402
1403       /* For interleaving, only the alignment of the first access
1404          matters.  */
1405       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1406           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1407         continue;
1408
1409       /* For invariant accesses there is nothing to enhance.  */
1410       if (integer_zerop (DR_STEP (dr)))
1411         continue;
1412
1413       /* Strided accesses perform only component accesses, alignment is
1414          irrelevant for them.  */
1415       if (STMT_VINFO_STRIDED_P (stmt_info)
1416           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1417         continue;
1418
1419       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1420       do_peeling = vector_alignment_reachable_p (dr);
1421       if (do_peeling)
1422         {
1423           if (known_alignment_for_access_p (dr))
1424             {
1425               unsigned int npeel_tmp;
1426               bool negative = tree_int_cst_compare (DR_STEP (dr),
1427                                                     size_zero_node) < 0;
1428
1429               /* Save info about DR in the hash table.  */
1430               if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1431                 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1432                   = new hash_table<peel_info_hasher> (1);
1433
1434               vectype = STMT_VINFO_VECTYPE (stmt_info);
1435               nelements = TYPE_VECTOR_SUBPARTS (vectype);
1436               mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1437                                                 TREE_TYPE (DR_REF (dr))));
1438               npeel_tmp = (negative
1439                            ? (mis - nelements) : (nelements - mis))
1440                   & (nelements - 1);
1441
1442               /* For multiple types, it is possible that the bigger type access
1443                  will have more than one peeling option.  E.g., a loop with two
1444                  types: one of size (vector size / 4), and the other one of
1445                  size (vector size / 8).  Vectorization factor will 8.  If both
1446                  access are misaligned by 3, the first one needs one scalar
1447                  iteration to be aligned, and the second one needs 5.  But the
1448                  the first one will be aligned also by peeling 5 scalar
1449                  iterations, and in that case both accesses will be aligned.
1450                  Hence, except for the immediate peeling amount, we also want
1451                  to try to add full vector size, while we don't exceed
1452                  vectorization factor.
1453                  We do this automtically for cost model, since we calculate cost
1454                  for every peeling option.  */
1455               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1456                 possible_npeel_number = vf /nelements;
1457
1458               /* Handle the aligned case. We may decide to align some other
1459                  access, making DR unaligned.  */
1460               if (DR_MISALIGNMENT (dr) == 0)
1461                 {
1462                   npeel_tmp = 0;
1463                   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1464                     possible_npeel_number++;
1465                 }
1466
1467               for (j = 0; j < possible_npeel_number; j++)
1468                 {
1469                   gcc_assert (npeel_tmp <= vf);
1470                   vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1471                   npeel_tmp += nelements;
1472                 }
1473
1474               all_misalignments_unknown = false;
1475               /* Data-ref that was chosen for the case that all the
1476                  misalignments are unknown is not relevant anymore, since we
1477                  have a data-ref with known alignment.  */
1478               dr0 = NULL;
1479             }
1480           else
1481             {
1482               /* If we don't know any misalignment values, we prefer
1483                  peeling for data-ref that has the maximum number of data-refs
1484                  with the same alignment, unless the target prefers to align
1485                  stores over load.  */
1486               if (all_misalignments_unknown)
1487                 {
1488                   unsigned same_align_drs
1489                     = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1490                   if (!dr0
1491                       || same_align_drs_max < same_align_drs)
1492                     {
1493                       same_align_drs_max = same_align_drs;
1494                       dr0 = dr;
1495                     }
1496                   /* For data-refs with the same number of related
1497                      accesses prefer the one where the misalign
1498                      computation will be invariant in the outermost loop.  */
1499                   else if (same_align_drs_max == same_align_drs)
1500                     {
1501                       struct loop *ivloop0, *ivloop;
1502                       ivloop0 = outermost_invariant_loop_for_expr
1503                           (loop, DR_BASE_ADDRESS (dr0));
1504                       ivloop = outermost_invariant_loop_for_expr
1505                           (loop, DR_BASE_ADDRESS (dr));
1506                       if ((ivloop && !ivloop0)
1507                           || (ivloop && ivloop0
1508                               && flow_loop_nested_p (ivloop, ivloop0)))
1509                         dr0 = dr;
1510                     }
1511
1512                   if (!first_store && DR_IS_WRITE (dr))
1513                     first_store = dr;
1514                 }
1515
1516               /* If there are both known and unknown misaligned accesses in the
1517                  loop, we choose peeling amount according to the known
1518                  accesses.  */
1519               if (!supportable_dr_alignment)
1520                 {
1521                   dr0 = dr;
1522                   if (!first_store && DR_IS_WRITE (dr))
1523                     first_store = dr;
1524                 }
1525             }
1526         }
1527       else
1528         {
1529           if (!aligned_access_p (dr))
1530             {
1531               if (dump_enabled_p ())
1532                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1533                                  "vector alignment may not be reachable\n");
1534               break;
1535             }
1536         }
1537     }
1538
1539   /* Check if we can possibly peel the loop.  */
1540   if (!vect_can_advance_ivs_p (loop_vinfo)
1541       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1542     do_peeling = false;
1543
1544   if (do_peeling
1545       && all_misalignments_unknown
1546       && vect_supportable_dr_alignment (dr0, false))
1547     {
1548       /* Check if the target requires to prefer stores over loads, i.e., if
1549          misaligned stores are more expensive than misaligned loads (taking
1550          drs with same alignment into account).  */
1551       if (first_store && DR_IS_READ (dr0))
1552         {
1553           unsigned int load_inside_cost = 0, load_outside_cost = 0;
1554           unsigned int store_inside_cost = 0, store_outside_cost = 0;
1555           unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1556           unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1557           stmt_vector_for_cost dummy;
1558           dummy.create (2);
1559
1560           vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1561                                      &dummy);
1562           vect_get_data_access_cost (first_store, &store_inside_cost,
1563                                      &store_outside_cost, &dummy);
1564
1565           dummy.release ();
1566
1567           /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1568              aligning the load DR0).  */
1569           load_inside_penalty = store_inside_cost;
1570           load_outside_penalty = store_outside_cost;
1571           for (i = 0;
1572                STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1573                           DR_STMT (first_store))).iterate (i, &dr);
1574                i++)
1575             if (DR_IS_READ (dr))
1576               {
1577                 load_inside_penalty += load_inside_cost;
1578                 load_outside_penalty += load_outside_cost;
1579               }
1580             else
1581               {
1582                 load_inside_penalty += store_inside_cost;
1583                 load_outside_penalty += store_outside_cost;
1584               }
1585
1586           /* Calculate the penalty for leaving DR0 unaligned (by
1587              aligning the FIRST_STORE).  */
1588           store_inside_penalty = load_inside_cost;
1589           store_outside_penalty = load_outside_cost;
1590           for (i = 0;
1591                STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1592                       DR_STMT (dr0))).iterate (i, &dr);
1593                i++)
1594             if (DR_IS_READ (dr))
1595               {
1596                 store_inside_penalty += load_inside_cost;
1597                 store_outside_penalty += load_outside_cost;
1598               }
1599             else
1600               {
1601                 store_inside_penalty += store_inside_cost;
1602                 store_outside_penalty += store_outside_cost;
1603               }
1604
1605           if (load_inside_penalty > store_inside_penalty
1606               || (load_inside_penalty == store_inside_penalty
1607                   && load_outside_penalty > store_outside_penalty))
1608             dr0 = first_store;
1609         }
1610
1611       /* In case there are only loads with different unknown misalignments, use
1612          peeling only if it may help to align other accesses in the loop or
1613          if it may help improving load bandwidth when we'd end up using
1614          unaligned loads.  */
1615       tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1616       if (!first_store
1617           && !STMT_VINFO_SAME_ALIGN_REFS (
1618                   vinfo_for_stmt (DR_STMT (dr0))).length ()
1619           && (vect_supportable_dr_alignment (dr0, false)
1620               != dr_unaligned_supported
1621               || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1622                   == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1623         do_peeling = false;
1624     }
1625
1626   if (do_peeling && !dr0)
1627     {
1628       /* Peeling is possible, but there is no data access that is not supported
1629          unless aligned. So we try to choose the best possible peeling.  */
1630
1631       /* We should get here only if there are drs with known misalignment.  */
1632       gcc_assert (!all_misalignments_unknown);
1633
1634       /* Choose the best peeling from the hash table.  */
1635       dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1636                                                    &body_cost_vec);
1637       if (!dr0 || !npeel)
1638         do_peeling = false;
1639     }
1640
1641   if (do_peeling)
1642     {
1643       stmt = DR_STMT (dr0);
1644       stmt_info = vinfo_for_stmt (stmt);
1645       vectype = STMT_VINFO_VECTYPE (stmt_info);
1646       nelements = TYPE_VECTOR_SUBPARTS (vectype);
1647
1648       if (known_alignment_for_access_p (dr0))
1649         {
1650           bool negative = tree_int_cst_compare (DR_STEP (dr0),
1651                                                 size_zero_node) < 0;
1652           if (!npeel)
1653             {
1654               /* Since it's known at compile time, compute the number of
1655                  iterations in the peeled loop (the peeling factor) for use in
1656                  updating DR_MISALIGNMENT values.  The peeling factor is the
1657                  vectorization factor minus the misalignment as an element
1658                  count.  */
1659               mis = DR_MISALIGNMENT (dr0);
1660               mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1661               npeel = ((negative ? mis - nelements : nelements - mis)
1662                        & (nelements - 1));
1663             }
1664
1665           /* For interleaved data access every iteration accesses all the
1666              members of the group, therefore we divide the number of iterations
1667              by the group size.  */
1668           stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1669           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1670             npeel /= GROUP_SIZE (stmt_info);
1671
1672           if (dump_enabled_p ())
1673             dump_printf_loc (MSG_NOTE, vect_location,
1674                              "Try peeling by %d\n", npeel);
1675         }
1676
1677       /* Ensure that all data refs can be vectorized after the peel.  */
1678       FOR_EACH_VEC_ELT (datarefs, i, dr)
1679         {
1680           int save_misalignment;
1681
1682           if (dr == dr0)
1683             continue;
1684
1685           stmt = DR_STMT (dr);
1686           stmt_info = vinfo_for_stmt (stmt);
1687           /* For interleaving, only the alignment of the first access
1688             matters.  */
1689           if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1690               && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1691             continue;
1692
1693           /* Strided accesses perform only component accesses, alignment is
1694              irrelevant for them.  */
1695           if (STMT_VINFO_STRIDED_P (stmt_info)
1696               && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1697             continue;
1698
1699           save_misalignment = DR_MISALIGNMENT (dr);
1700           vect_update_misalignment_for_peel (dr, dr0, npeel);
1701           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1702           SET_DR_MISALIGNMENT (dr, save_misalignment);
1703
1704           if (!supportable_dr_alignment)
1705             {
1706               do_peeling = false;
1707               break;
1708             }
1709         }
1710
1711       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1712         {
1713           stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1714           if (!stat)
1715             do_peeling = false;
1716           else
1717             {
1718               body_cost_vec.release ();
1719               return stat;
1720             }
1721         }
1722
1723       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
1724       if (do_peeling)
1725         {
1726           unsigned max_allowed_peel
1727             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1728           if (max_allowed_peel != (unsigned)-1)
1729             {
1730               unsigned max_peel = npeel;
1731               if (max_peel == 0)
1732                 {
1733                   gimple dr_stmt = DR_STMT (dr0);
1734                   stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1735                   tree vtype = STMT_VINFO_VECTYPE (vinfo);
1736                   max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1737                 }
1738               if (max_peel > max_allowed_peel)
1739                 {
1740                   do_peeling = false;
1741                   if (dump_enabled_p ())
1742                     dump_printf_loc (MSG_NOTE, vect_location,
1743                         "Disable peeling, max peels reached: %d\n", max_peel);
1744                 }
1745             }
1746         }
1747
1748       /* Cost model #2 - if peeling may result in a remaining loop not
1749          iterating enough to be vectorized then do not peel.  */
1750       if (do_peeling
1751           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1752         {
1753           unsigned max_peel
1754             = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1755           if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1756               < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1757             do_peeling = false;
1758         }
1759
1760       if (do_peeling)
1761         {
1762           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1763              If the misalignment of DR_i is identical to that of dr0 then set
1764              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
1765              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1766              by the peeling factor times the element size of DR_i (MOD the
1767              vectorization factor times the size).  Otherwise, the
1768              misalignment of DR_i must be set to unknown.  */
1769           FOR_EACH_VEC_ELT (datarefs, i, dr)
1770             if (dr != dr0)
1771               vect_update_misalignment_for_peel (dr, dr0, npeel);
1772
1773           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1774           if (npeel)
1775             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1776           else
1777             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1778               = DR_MISALIGNMENT (dr0);
1779           SET_DR_MISALIGNMENT (dr0, 0);
1780           if (dump_enabled_p ())
1781             {
1782               dump_printf_loc (MSG_NOTE, vect_location,
1783                                "Alignment of access forced using peeling.\n");
1784               dump_printf_loc (MSG_NOTE, vect_location,
1785                                "Peeling for alignment will be applied.\n");
1786             }
1787           /* The inside-loop cost will be accounted for in vectorizable_load
1788              and vectorizable_store correctly with adjusted alignments.
1789              Drop the body_cost_vec on the floor here.  */
1790           body_cost_vec.release ();
1791
1792           stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1793           gcc_assert (stat);
1794           return stat;
1795         }
1796     }
1797
1798   body_cost_vec.release ();
1799
1800   /* (2) Versioning to force alignment.  */
1801
1802   /* Try versioning if:
1803      1) optimize loop for speed
1804      2) there is at least one unsupported misaligned data ref with an unknown
1805         misalignment, and
1806      3) all misaligned data refs with a known misalignment are supported, and
1807      4) the number of runtime alignment checks is within reason.  */
1808
1809   do_versioning =
1810         optimize_loop_nest_for_speed_p (loop)
1811         && (!loop->inner); /* FORNOW */
1812
1813   if (do_versioning)
1814     {
1815       FOR_EACH_VEC_ELT (datarefs, i, dr)
1816         {
1817           stmt = DR_STMT (dr);
1818           stmt_info = vinfo_for_stmt (stmt);
1819
1820           /* For interleaving, only the alignment of the first access
1821              matters.  */
1822           if (aligned_access_p (dr)
1823               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1824                   && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1825             continue;
1826
1827           if (STMT_VINFO_STRIDED_P (stmt_info))
1828             {
1829               /* Strided loads perform only component accesses, alignment is
1830                  irrelevant for them.  */
1831               if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1832                 continue;
1833               do_versioning = false;
1834               break;
1835             }
1836
1837           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1838
1839           if (!supportable_dr_alignment)
1840             {
1841               gimple stmt;
1842               int mask;
1843               tree vectype;
1844
1845               if (known_alignment_for_access_p (dr)
1846                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1847                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1848                 {
1849                   do_versioning = false;
1850                   break;
1851                 }
1852
1853               stmt = DR_STMT (dr);
1854               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1855               gcc_assert (vectype);
1856
1857               /* The rightmost bits of an aligned address must be zeros.
1858                  Construct the mask needed for this test.  For example,
1859                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1860                  mask must be 15 = 0xf. */
1861               mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1862
1863               /* FORNOW: use the same mask to test all potentially unaligned
1864                  references in the loop.  The vectorizer currently supports
1865                  a single vector size, see the reference to
1866                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1867                  vectorization factor is computed.  */
1868               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1869                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1870               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1871               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1872                       DR_STMT (dr));
1873             }
1874         }
1875
1876       /* Versioning requires at least one misaligned data reference.  */
1877       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1878         do_versioning = false;
1879       else if (!do_versioning)
1880         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1881     }
1882
1883   if (do_versioning)
1884     {
1885       vec<gimple> may_misalign_stmts
1886         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1887       gimple stmt;
1888
1889       /* It can now be assumed that the data references in the statements
1890          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1891          of the loop being vectorized.  */
1892       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1893         {
1894           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1895           dr = STMT_VINFO_DATA_REF (stmt_info);
1896           SET_DR_MISALIGNMENT (dr, 0);
1897           if (dump_enabled_p ())
1898             dump_printf_loc (MSG_NOTE, vect_location,
1899                              "Alignment of access forced using versioning.\n");
1900         }
1901
1902       if (dump_enabled_p ())
1903         dump_printf_loc (MSG_NOTE, vect_location,
1904                          "Versioning for alignment will be applied.\n");
1905
1906       /* Peeling and versioning can't be done together at this time.  */
1907       gcc_assert (! (do_peeling && do_versioning));
1908
1909       stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1910       gcc_assert (stat);
1911       return stat;
1912     }
1913
1914   /* This point is reached if neither peeling nor versioning is being done.  */
1915   gcc_assert (! (do_peeling || do_versioning));
1916
1917   stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1918   return stat;
1919 }
1920
1921
1922 /* Function vect_find_same_alignment_drs.
1923
1924    DDR relates two data references of the loop described by LOOP_VINFO.
1925    When their dependence distance in that loop is zero, or is a multiple
1926    of the vectorization factor with equal scalar mode sizes, push each
1927    reference onto the other's STMT_VINFO_SAME_ALIGN_REFS vector.  */
1928
1929 static void
1930 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1931                               loop_vec_info loop_vinfo)
1932 {
1933   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
1934   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1935   struct data_reference *dra = DDR_A (ddr);
1936   struct data_reference *drb = DDR_B (ddr);
1937   stmt_vec_info info_a = vinfo_for_stmt (DR_STMT (dra));
1938   stmt_vec_info info_b = vinfo_for_stmt (DR_STMT (drb));
1939   int size_a = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1940   int size_b = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1941   lambda_vector dist_vec;
1942   unsigned int depth;
1943
1944   /* Nothing to record when DDR_ARE_DEPENDENT is chrec_known or
1945      chrec_dont_know, or when DDR pairs a reference with itself.  */
1946   if (DDR_ARE_DEPENDENT (ddr) == chrec_known
1947       || dra == drb
1948       || DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1949     return;
1950
1951   /* Without any distance vector there is nothing to inspect.  */
1952   if (DDR_NUM_DIST_VECTS (ddr) == 0)
1953     return;
1954
1955   /* Data-dependence analysis reports a distance vector of zero for
1956      data-references that overlap only in the first iteration but have
1957      steps of different sign (see PR45764), so as a sanity check the
1958      two DR_STEPs must be equal.  */
1959   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1960     return;
1961
1962   depth = index_in_loop_nest (vect_loop->num, DDR_LOOP_NEST (ddr));
1963   for (unsigned int ix = 0;
1964        DDR_DIST_VECTS (ddr).iterate (ix, &dist_vec);
1965        ++ix)
1966     {
1967       int dist = dist_vec[depth];
1968
1969       if (dump_enabled_p ())
1970         dump_printf_loc (MSG_NOTE, vect_location,
1971                          "dependence distance  = %d.\n", dist);
1972
1973       /* Only a zero distance, or a multiple of VF between accesses of
1974          equal size, is recorded as "same alignment".  */
1975       if (dist != 0 && (dist % vf != 0 || size_a != size_b))
1976         continue;
1977
1978       STMT_VINFO_SAME_ALIGN_REFS (info_a).safe_push (drb);
1979       STMT_VINFO_SAME_ALIGN_REFS (info_b).safe_push (dra);
1980       if (!dump_enabled_p ())
1981         continue;
1982
1983       dump_printf_loc (MSG_NOTE, vect_location,
1984                        "accesses have the same alignment.\n");
1985       dump_printf (MSG_NOTE,
1986                    "dependence distance modulo vf == 0 between ");
1987       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1988       dump_printf (MSG_NOTE,  " and ");
1989       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1990       dump_printf (MSG_NOTE, "\n");
1991     }
1992 }
1993
1994
1995 /* Function vect_analyze_data_refs_alignment
1996
1997    For loop vectorization (LOOP_VINFO non-null), first record which data
1998    references share alignment by walking LOOP_VINFO_DDRS.  Then compute the
1999    alignment of the data references of LOOP_VINFO or BB_VINFO.
2000    Return FALSE if that computation fails, TRUE otherwise.  */
2001
2002 bool
2003 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
2004                                   bb_vec_info bb_vinfo)
2005 {
2006   if (dump_enabled_p ())
2007     dump_printf_loc (MSG_NOTE, vect_location,
2008                      "=== vect_analyze_data_refs_alignment ===\n");
2009
2010   /* Use the data dependence relations to mark groups of data
2011      references that have the same alignment.  */
2012   if (loop_vinfo)
2013     {
2014       vec<ddr_p> relations = LOOP_VINFO_DDRS (loop_vinfo);
2015       struct data_dependence_relation *rel;
2016
2017       for (unsigned int ix = 0; relations.iterate (ix, &rel); ++ix)
2018         vect_find_same_alignment_drs (rel, loop_vinfo);
2019     }
2020
2021   bool computed = vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo);
2022   if (computed)
2023     return true;
2024
2025   if (dump_enabled_p ())
2026     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2027                      "not vectorized: can't calculate alignment "
2028                      "for data ref.\n");
2029   return false;
2030 }
2031
2032
2033 /* Analyze groups of accesses: check that DR belongs to a group of
2034    accesses of legal size, step, etc.  Detect gaps, single element
2035    interleaving, and other special cases.  Set grouped access info and
2036    collect groups of stores for SLP analysis.  Return FALSE on failure.  */
2037
2038 static bool
2039 vect_analyze_group_access (struct data_reference *dr)
2040 {
2041   tree step = DR_STEP (dr);
2042   tree scalar_type = TREE_TYPE (DR_REF (dr));
2043   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2044   gimple stmt = DR_STMT (dr);
2045   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2046   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2047   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2048   HOST_WIDE_INT dr_step = -1;
2049   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2050   bool slp_impossible = false;
2051   struct loop *loop = NULL;
2052
2053   if (loop_vinfo)
2054     loop = LOOP_VINFO_LOOP (loop_vinfo);
2055
2056   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the size
2057      of the group (including gaps).  It stays 0 if STEP is not a shwi.  */
2058   if (tree_fits_shwi_p (step))
2059     {
2060       dr_step = tree_to_shwi (step);
2061       groupsize = absu_hwi (dr_step) / type_size;
2062     }
2063   else
2064     groupsize = 0;
2065
2066   /* Not consecutive access is possible only if it is a part of interleaving.  */
2067   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2068     {
2069       /* Check if this DR is a part of interleaving, and is a single
2070          element of the group that is accessed in the loop.  */
2071
2072       /* Gaps are supported only for loads.  STEP must be a multiple of the
2073          type size.  The size of the group must be a power of 2.  */
2074       if (DR_IS_READ (dr)
2075           && (dr_step % type_size) == 0
2076           && groupsize > 0
2077           && exact_log2 (groupsize) != -1)
2078         {
2079           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2080           GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2081           if (dump_enabled_p ())
2082             {
2083               dump_printf_loc (MSG_NOTE, vect_location,
2084                                "Detected single element interleaving ");
2085               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2086               dump_printf (MSG_NOTE, " step ");
2087               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2088               dump_printf (MSG_NOTE, "\n");
2089             }
2090
2091           if (loop_vinfo)
2092             {
2093               if (dump_enabled_p ())
2094                 dump_printf_loc (MSG_NOTE, vect_location,
2095                                  "Data access with gaps requires scalar "
2096                                  "epilogue loop\n");
2097               if (loop->inner)
2098                 {
2099                   if (dump_enabled_p ())
2100                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2101                                      "Peeling for outer loop is not"
2102                                      " supported\n");
2103                   return false;
2104                 }
2105
2106               LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2107             }
2108
2109           return true;
2110         }
2111
2112       if (dump_enabled_p ())
2113         {
2114           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2115                            "not consecutive access ");
2116           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2117           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2118         }
2119
2120       if (bb_vinfo)
2121         {
2122           /* Mark the statement as unvectorizable, but still return TRUE.  */
2123           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2124           return true;
2125         }
2126
2127       return false;
2128     }
2129
2130   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2131     {
2132       /* First stmt in the interleaving chain.  Check the chain.  */
2133       gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2134       struct data_reference *data_ref = dr;
2135       unsigned int count = 1;
2136       tree prev_init = DR_INIT (data_ref);
2137       gimple prev = stmt;
2138       HOST_WIDE_INT diff, gaps = 0;
2139
2140       while (next)
2141         {
2142           /* Skip same data-refs.  In case that two or more stmts share
2143              data-ref (supported only for loads), we vectorize only the first
2144              stmt, and the rest get their vectorized loads from the first
2145              one.  */
2146           if (!tree_int_cst_compare (DR_INIT (data_ref),
2147                                      DR_INIT (STMT_VINFO_DATA_REF (
2148                                                    vinfo_for_stmt (next)))))
2149             {
2150               if (DR_IS_WRITE (data_ref))
2151                 {
2152                   if (dump_enabled_p ())
2153                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2154                                      "Two store stmts share the same dr.\n");
2155                   return false;
2156                 }
2157
2158               /* For load use the same data-ref load.  */
2159               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2160
2161               prev = next;
2162               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2163               continue;
2164             }
2165
2166           prev = next;
2167           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2168
2169           /* All group members have the same STEP by construction.  */
2170           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2171
2172           /* Check that the distance between two accesses is equal to the type
2173              size.  Otherwise, we have gaps.  DIFF is counted in elements.  */
2174           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2175                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2176           if (diff != 1)
2177             {
2178               /* FORNOW: SLP of accesses with gaps is not supported.  */
2179               slp_impossible = true;
2180               if (DR_IS_WRITE (data_ref))
2181                 {
2182                   if (dump_enabled_p ())
2183                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2184                                      "interleaved store with gaps\n");
2185                   return false;
2186                 }
2187
2188               gaps += diff - 1;
2189             }
2190
2191           last_accessed_element += diff;
2192
2193           /* Store the gap from the previous member of the group.  If there is
2194              no gap in the access, GROUP_GAP is always 1.  */
2195           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2196
2197           prev_init = DR_INIT (data_ref);
2198           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2199           /* Count the number of data-refs in the chain.  */
2200           count++;
2201         }
2202       /* GROUPSIZE could not be derived from STEP; use the chain extent.  */
2203       if (groupsize == 0)
2204         groupsize = count + gaps;
2205
2206       /* Check that the size of the interleaving is equal to count for stores,
2207          i.e., that there are no gaps.  */
2208       if (groupsize != count)
2209         {
2210           if (DR_IS_READ (dr))
2211             {
2212               slp_impossible = true;
2213               /* There is a gap after the last load in the group.  This gap is
2214                  the difference between the groupsize and the number of
2215                  elements.  When there is no gap, this difference is 0.  */
2216               GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2217             }
2218           else
2219             {
2220               if (dump_enabled_p ())
2221                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2222                                  "interleaved store with gaps\n");
2223               return false;
2224             }
2225         }
2226
2227       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2228       if (dump_enabled_p ())
2229         dump_printf_loc (MSG_NOTE, vect_location,
2230                          "Detected interleaving of size %d\n", (int)groupsize);
2231
2232       /* SLP: create an SLP data structure for every interleaving group of
2233          stores for further analysis in vect_analyse_slp.  */
2234       if (DR_IS_WRITE (dr) && !slp_impossible)
2235         {
2236           if (loop_vinfo)
2237             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2238           if (bb_vinfo)
2239             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2240         }
2241
2242       /* There is a gap at the end of the group.  */
2243       if (groupsize - last_accessed_element > 0 && loop_vinfo)
2244         {
2245           if (dump_enabled_p ())
2246             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2247                              "Data access with gaps requires scalar "
2248                              "epilogue loop\n");
2249           if (loop->inner)
2250             {
2251               if (dump_enabled_p ())
2252                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2253                                  "Peeling for outer loop is not supported\n");
2254               return false;
2255             }
2256
2257           LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2258         }
2259     }
2260
2261   return true;
2262 }
2263
2264
2265 /* Analyze the access pattern of the data-reference DR and return TRUE
2266    if it is acceptable.  Non-consecutive accesses are handed over to
2267    vect_analyze_group_access() to be analyzed as groups of accesses.  */
2268
2269 static bool
2270 vect_analyze_data_ref_access (struct data_reference *dr)
2271 {
2272   gimple stmt = DR_STMT (dr);
2273   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2274   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2275   struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
2276   tree elem_type = TREE_TYPE (DR_REF (dr));
2277   tree step = DR_STEP (dr);
2278
2279   if (loop_vinfo && !step)
2280     {
2281       if (dump_enabled_p ())
2282         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2283                          "bad data-ref access in loop\n");
2284       return false;
2285     }
2286
2287   /* Zero step in a loop: only loads are allowed, and only when the
2288      statement is not nested in the inner loop.  */
2289   if (loop_vinfo && integer_zerop (step))
2290     {
2291       GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2292       if (!nested_in_vect_loop_p (loop, stmt))
2293         return DR_IS_READ (dr);
2294
2295       if (dump_enabled_p ())
2296         dump_printf_loc (MSG_NOTE, vect_location,
2297                          "zero step in inner loop of nest\n");
2298       return false;
2299     }
2300
2301   if (loop && nested_in_vect_loop_p (loop, stmt))
2302     {
2303       /* Interleaved accesses are not yet supported within outer-loop
2304          vectorization for references in the inner-loop.  */
2305       GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2306
2307       /* The outer-loop step is used for the rest of the analysis.  */
2308       step = STMT_VINFO_DR_STEP (stmt_info);
2309       if (integer_zerop (step))
2310         {
2311           if (dump_enabled_p ())
2312             dump_printf_loc (MSG_NOTE, vect_location,
2313                              "zero step in outer loop.\n");
2314           return DR_IS_READ (dr);
2315         }
2316     }
2317
2318   /* Consecutive?  That is, STEP equals the scalar size or its negation.  */
2319   if (TREE_CODE (step) == INTEGER_CST)
2320     {
2321       HOST_WIDE_INT step_val = TREE_INT_CST_LOW (step);
2322       tree unit_size = TYPE_SIZE_UNIT (elem_type);
2323       bool consecutive = tree_int_cst_compare (step, unit_size) == 0;
2324
2325       if (!consecutive && step_val < 0)
2326         consecutive = compare_tree_int (unit_size, -step_val) == 0;
2327
2328       if (consecutive)
2329         {
2330           /* Mark that it is not interleaving.  */
2331           GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2332           return true;
2333         }
2334     }
2335
2336   if (loop && nested_in_vect_loop_p (loop, stmt))
2337     {
2338       if (dump_enabled_p ())
2339         dump_printf_loc (MSG_NOTE, vect_location,
2340                          "grouped access in outer loop.\n");
2341       return false;
2342     }
2343
2344   /* Not consecutive access - check if it's a part of interleaving group.  */
2345   if (TREE_CODE (step) == INTEGER_CST)
2346     return vect_analyze_group_access (dr);
2347
2348   /* Assume this is a DR handled by non-constant strided load case.  */
2349   if (!STMT_VINFO_STRIDED_P (stmt_info))
2350     return false;
2351
2352   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2353     return true;
2354   return vect_analyze_group_access (dr);
2355 }
2356
2357
2358
2359 /*  A helper function used in the comparator function to sort data
2360     references.  T1 and T2 are two data references to be compared.
2361     The function returns -1, 0, or 1.  */
2362
2363 static int
2364 compare_tree (tree t1, tree t2)
2365 {
2366   int i, cmp;
2367   enum tree_code code;
2368   char tclass;
2369
2370   if (t1 == t2)
2371     return 0;
2372   if (t1 == NULL)
2373     return -1;
2374   if (t2 == NULL)
2375     return 1;
2376
2377
2378   if (TREE_CODE (t1) != TREE_CODE (t2))
2379     return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2380
2381   code = TREE_CODE (t1);
2382   switch (code)
2383     {
2384     /* For const values, we can just use hash values for comparisons.  */
2385     case INTEGER_CST:
2386     case REAL_CST:
2387     case FIXED_CST:
2388     case STRING_CST:
2389     case COMPLEX_CST:
2390     case VECTOR_CST:
2391       {
2392         hashval_t h1 = iterative_hash_expr (t1, 0);
2393         hashval_t h2 = iterative_hash_expr (t2, 0);
2394         if (h1 != h2)
2395           return h1 < h2 ? -1 : 1;
2396         break;
2397       }
2398
2399     case SSA_NAME:
2400       cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2401       if (cmp != 0)
2402         return cmp;
2403
2404       if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2405         return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2406       break;
2407
2408     default:
2409       tclass = TREE_CODE_CLASS (code);
2410
2411       /* For var-decl, we could compare their UIDs.  */
2412       if (tclass == tcc_declaration)
2413         {
2414           if (DECL_UID (t1) != DECL_UID (t2))
2415             return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2416           break;
2417         }
2418
2419       /* For expressions with operands, compare their operands recursively.  */
2420       for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2421         {
2422           cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2423           if (cmp != 0)
2424             return cmp;
2425         }
2426     }
2427
2428   return 0;
2429 }
2430
2431
2432 /* Compare two data-references DRA and DRB to group them into chunks
2433    suitable for grouping.  */
2434
2435 static int
2436 dr_group_sort_cmp (const void *dra_, const void *drb_)
2437 {
2438   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2439   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2440   int cmp;
2441
2442   /* Stabilize sort.  */
2443   if (dra == drb)
2444     return 0;
2445
2446   /* Ordering of DRs according to base.  */
2447   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2448     {
2449       cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2450       if (cmp != 0)
2451         return cmp;
2452     }
2453
2454   /* And according to DR_OFFSET.  */
2455   if (!dr_equal_offsets_p (dra, drb))
2456     {
2457       cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2458       if (cmp != 0)
2459         return cmp;
2460     }
2461
2462   /* Put reads before writes.  */
2463   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2464     return DR_IS_READ (dra) ? -1 : 1;
2465
2466   /* Then sort after access size.  */
2467   if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2468                         TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2469     {
2470       cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2471                           TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2472       if (cmp != 0)
2473         return cmp;
2474     }
2475
2476   /* And after step.  */
2477   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2478     {
2479       cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2480       if (cmp != 0)
2481         return cmp;
2482     }
2483
2484   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2485   cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2486   if (cmp == 0)
2487     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2488   return cmp;
2489 }
2490
2491 /* Function vect_analyze_data_ref_accesses.
2492
2493    Analyze the access pattern of all the data references in the loop.
2494
2495    FORNOW: the only access pattern that is considered vectorizable is a
2496            simple step 1 (consecutive) access.
2497
2498    FORNOW: handle only arrays and pointer accesses.  */
2499
2500 bool
2501 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2502 {
2503   unsigned int i;
2504   vec<data_reference_p> datarefs;
2505   struct data_reference *dr;
2506
2507   if (dump_enabled_p ())
2508     dump_printf_loc (MSG_NOTE, vect_location,
2509                      "=== vect_analyze_data_ref_accesses ===\n");
2510
2511   if (loop_vinfo)
2512     datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2513   else
2514     datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2515
2516   if (datarefs.is_empty ())
2517     return true;
2518
2519   /* Sort the array of datarefs to make building the interleaving chains
2520      linear.  Don't modify the original vector's order, it is needed for
2521      determining what dependencies are reversed.  */
2522   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2523   datarefs_copy.qsort (dr_group_sort_cmp);
2524
2525   /* Build the interleaving chains.  */
2526   for (i = 0; i < datarefs_copy.length () - 1;)
2527     {
2528       data_reference_p dra = datarefs_copy[i];
2529       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2530       stmt_vec_info lastinfo = NULL;
2531       for (i = i + 1; i < datarefs_copy.length (); ++i)
2532         {
2533           data_reference_p drb = datarefs_copy[i];
2534           stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2535
2536           /* ???  Imperfect sorting (non-compatible types, non-modulo
2537              accesses, same accesses) can lead to a group to be artificially
2538              split here as we don't just skip over those.  If it really
2539              matters we can push those to a worklist and re-iterate
2540              over them.  The we can just skip ahead to the next DR here.  */
2541
2542           /* Check that the data-refs have same first location (except init)
2543              and they are both either store or load (not load and store,
2544              not masked loads or stores).  */
2545           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2546               || !operand_equal_p (DR_BASE_ADDRESS (dra),
2547                                    DR_BASE_ADDRESS (drb), 0)
2548               || !dr_equal_offsets_p (dra, drb)
2549               || !gimple_assign_single_p (DR_STMT (dra))
2550               || !gimple_assign_single_p (DR_STMT (drb)))
2551             break;
2552
2553           /* Check that the data-refs have the same constant size.  */
2554           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2555           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2556           if (!tree_fits_uhwi_p (sza)
2557               || !tree_fits_uhwi_p (szb)
2558               || !tree_int_cst_equal (sza, szb))
2559             break;
2560
2561           /* Check that the data-refs have the same step.  */
2562           if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2563             break;
2564
2565           /* Do not place the same access in the interleaving chain twice.  */
2566           if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2567             break;
2568
2569           /* Check the types are compatible.
2570              ???  We don't distinguish this during sorting.  */
2571           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2572                                    TREE_TYPE (DR_REF (drb))))
2573             break;
2574
2575           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2576           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2577           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2578           gcc_assert (init_a < init_b);
2579
2580           /* If init_b == init_a + the size of the type * k, we have an
2581              interleaving, and DRA is accessed before DRB.  */
2582           HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2583           if ((init_b - init_a) % type_size_a != 0)
2584             break;
2585
2586           /* If we have a store, the accesses are adjacent.  This splits
2587              groups into chunks we support (we don't support vectorization
2588              of stores with gaps).  */
2589           if (!DR_IS_READ (dra)
2590               && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2591                                              (DR_INIT (datarefs_copy[i-1]))
2592                   != type_size_a))
2593             break;
2594
2595           /* If the step (if not zero or non-constant) is greater than the
2596              difference between data-refs' inits this splits groups into
2597              suitable sizes.  */
2598           if (tree_fits_shwi_p (DR_STEP (dra)))
2599             {
2600               HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2601               if (step != 0 && step <= (init_b - init_a))
2602                 break;
2603             }
2604
2605           if (dump_enabled_p ())
2606             {
2607               dump_printf_loc (MSG_NOTE, vect_location,
2608                                "Detected interleaving ");
2609               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2610               dump_printf (MSG_NOTE,  " and ");
2611               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2612               dump_printf (MSG_NOTE, "\n");
2613             }
2614
2615           /* Link the found element into the group list.  */
2616           if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2617             {
2618               GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2619               lastinfo = stmtinfo_a;
2620             }
2621           GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2622           GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2623           lastinfo = stmtinfo_b;
2624         }
2625     }
2626
2627   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2628     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2629         && !vect_analyze_data_ref_access (dr))
2630       {
2631         if (dump_enabled_p ())
2632           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2633                            "not vectorized: complicated access pattern.\n");
2634
2635         if (bb_vinfo)
2636           {
2637             /* Mark the statement as not vectorizable.  */
2638             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2639             continue;
2640           }
2641         else
2642           {
2643             datarefs_copy.release ();
2644             return false;
2645           }
2646       }
2647
2648   datarefs_copy.release ();
2649   return true;
2650 }
2651
2652
2653 /* Operator == between two dr_with_seg_len objects.
2654
2655    This equality operator is used to make sure two data refs
2656    are the same one so that we will consider to combine the
2657    aliasing checks of those two pairs of data dependent data
2658    refs.  */
2659
2660 static bool
2661 operator == (const dr_with_seg_len& d1,
2662              const dr_with_seg_len& d2)
2663 {
2664   return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2665                           DR_BASE_ADDRESS (d2.dr), 0)
2666            && compare_tree (d1.offset, d2.offset) == 0
2667            && compare_tree (d1.seg_len, d2.seg_len) == 0;
2668 }
2669
2670 /* Function comp_dr_with_seg_len_pair.
2671
2672    Comparison function for sorting objects of dr_with_seg_len_pair_t
2673    so that we can combine aliasing checks in one scan.  */
2674
2675 static int
2676 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2677 {
2678   const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2679   const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2680
2681   const dr_with_seg_len &p11 = p1->first,
2682                         &p12 = p1->second,
2683                         &p21 = p2->first,
2684                         &p22 = p2->second;
2685
2686   /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2687      if a and c have the same basic address snd step, and b and d have the same
2688      address and step.  Therefore, if any a&c or b&d don't have the same address
2689      and step, we don't care the order of those two pairs after sorting.  */
2690   int comp_res;
2691
2692   if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2693                                 DR_BASE_ADDRESS (p21.dr))) != 0)
2694     return comp_res;
2695   if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2696                                 DR_BASE_ADDRESS (p22.dr))) != 0)
2697     return comp_res;
2698   if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2699     return comp_res;
2700   if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2701     return comp_res;
2702   if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2703     return comp_res;
2704   if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2705     return comp_res;
2706
2707   return 0;
2708 }
2709
2710 /* Function vect_vfa_segment_size.
2711
2712    Create an expression that computes the size of segment
2713    that will be accessed for a data reference.  The functions takes into
2714    account that realignment loads may access one more vector.
2715
2716    Input:
2717      DR: The data reference.
2718      LENGTH_FACTOR: segment length to consider.
2719
2720    Return an expression whose value is the size of segment which will be
2721    accessed by DR.  */
2722
2723 static tree
2724 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2725 {
2726   tree segment_length;
2727
2728   if (integer_zerop (DR_STEP (dr)))
2729     segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2730   else
2731     segment_length = size_binop (MULT_EXPR,
2732                                  fold_convert (sizetype, DR_STEP (dr)),
2733                                  fold_convert (sizetype, length_factor));
2734
2735   if (vect_supportable_dr_alignment (dr, false)
2736         == dr_explicit_realign_optimized)
2737     {
2738       tree vector_size = TYPE_SIZE_UNIT
2739                           (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2740
2741       segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2742     }
2743   return segment_length;
2744 }
2745
2746 /* Function vect_prune_runtime_alias_test_list.
2747
2748    Prune a list of ddrs to be tested at run-time by versioning for alias.
2749    Merge several alias checks into one if possible.
2750    Return FALSE if resulting list of ddrs is longer then allowed by
2751    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
2752
2753 bool
2754 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2755 {
2756   vec<ddr_p> may_alias_ddrs =
2757     LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2758   vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2759     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2760   int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2761   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2762
2763   ddr_p ddr;
2764   unsigned int i;
2765   tree length_factor;
2766
2767   if (dump_enabled_p ())
2768     dump_printf_loc (MSG_NOTE, vect_location,
2769                      "=== vect_prune_runtime_alias_test_list ===\n");
2770
2771   if (may_alias_ddrs.is_empty ())
2772     return true;
2773
2774   /* Basically, for each pair of dependent data refs store_ptr_0
2775      and load_ptr_0, we create an expression:
2776
2777      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2778      || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2779
2780      for aliasing checks.  However, in some cases we can decrease
2781      the number of checks by combining two checks into one.  For
2782      example, suppose we have another pair of data refs store_ptr_0
2783      and load_ptr_1, and if the following condition is satisfied:
2784
2785      load_ptr_0 < load_ptr_1  &&
2786      load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2787
2788      (this condition means, in each iteration of vectorized loop,
2789      the accessed memory of store_ptr_0 cannot be between the memory
2790      of load_ptr_0 and load_ptr_1.)
2791
2792      we then can use only the following expression to finish the
2793      alising checks between store_ptr_0 & load_ptr_0 and
2794      store_ptr_0 & load_ptr_1:
2795
2796      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2797      || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2798
2799      Note that we only consider that load_ptr_0 and load_ptr_1 have the
2800      same basic address.  */
2801
2802   comp_alias_ddrs.create (may_alias_ddrs.length ());
2803
2804   /* First, we collect all data ref pairs for aliasing checks.  */
2805   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2806     {
2807       struct data_reference *dr_a, *dr_b;
2808       gimple dr_group_first_a, dr_group_first_b;
2809       tree segment_length_a, segment_length_b;
2810       gimple stmt_a, stmt_b;
2811
2812       dr_a = DDR_A (ddr);
2813       stmt_a = DR_STMT (DDR_A (ddr));
2814       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2815       if (dr_group_first_a)
2816         {
2817           stmt_a = dr_group_first_a;
2818           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2819         }
2820
2821       dr_b = DDR_B (ddr);
2822       stmt_b = DR_STMT (DDR_B (ddr));
2823       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2824       if (dr_group_first_b)
2825         {
2826           stmt_b = dr_group_first_b;
2827           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2828         }
2829
2830       if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2831         length_factor = scalar_loop_iters;
2832       else
2833         length_factor = size_int (vect_factor);
2834       segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2835       segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2836
2837       dr_with_seg_len_pair_t dr_with_seg_len_pair
2838           (dr_with_seg_len (dr_a, segment_length_a),
2839            dr_with_seg_len (dr_b, segment_length_b));
2840
2841       if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2842         std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2843
2844       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2845     }
2846
2847   /* Second, we sort the collected data ref pairs so that we can scan
2848      them once to combine all possible aliasing checks.  */
2849   comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2850
2851   /* Third, we scan the sorted dr pairs and check if we can combine
2852      alias checks of two neighbouring dr pairs.  */
2853   for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2854     {
2855       /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2).  */
2856       dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2857                       *dr_b1 = &comp_alias_ddrs[i-1].second,
2858                       *dr_a2 = &comp_alias_ddrs[i].first,
2859                       *dr_b2 = &comp_alias_ddrs[i].second;
2860
2861       /* Remove duplicate data ref pairs.  */
2862       if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2863         {
2864           if (dump_enabled_p ())
2865             {
2866               dump_printf_loc (MSG_NOTE, vect_location,
2867                                "found equal ranges ");
2868               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2869                                  DR_REF (dr_a1->dr));
2870               dump_printf (MSG_NOTE,  ", ");
2871               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2872                                  DR_REF (dr_b1->dr));
2873               dump_printf (MSG_NOTE,  " and ");
2874               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2875                                  DR_REF (dr_a2->dr));
2876               dump_printf (MSG_NOTE,  ", ");
2877               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2878                                  DR_REF (dr_b2->dr));
2879               dump_printf (MSG_NOTE, "\n");
2880             }
2881
2882           comp_alias_ddrs.ordered_remove (i--);
2883           continue;
2884         }
2885
2886       if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2887         {
2888           /* We consider the case that DR_B1 and DR_B2 are same memrefs,
2889              and DR_A1 and DR_A2 are two consecutive memrefs.  */
2890           if (*dr_a1 == *dr_a2)
2891             {
2892               std::swap (dr_a1, dr_b1);
2893               std::swap (dr_a2, dr_b2);
2894             }
2895
2896           if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2897                                 DR_BASE_ADDRESS (dr_a2->dr),
2898                                 0)
2899               || !tree_fits_shwi_p (dr_a1->offset)
2900               || !tree_fits_shwi_p (dr_a2->offset))
2901             continue;
2902
2903           HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2904                                 - tree_to_shwi (dr_a1->offset));
2905
2906
2907           /* Now we check if the following condition is satisfied:
2908
2909              DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2910
2911              where DIFF = DR_A2->OFFSET - DR_A1->OFFSET.  However,
2912              SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2913              have to make a best estimation.  We can get the minimum value
2914              of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2915              then either of the following two conditions can guarantee the
2916              one above:
2917
2918              1: DIFF <= MIN_SEG_LEN_B
2919              2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2920
2921              */
2922
2923           HOST_WIDE_INT  min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2924                                           ? tree_to_shwi (dr_b1->seg_len)
2925                                           : vect_factor);
2926
2927           if (diff <= min_seg_len_b
2928               || (tree_fits_shwi_p (dr_a1->seg_len)
2929                   && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
2930             {
2931               if (dump_enabled_p ())
2932                 {
2933                   dump_printf_loc (MSG_NOTE, vect_location,
2934                                    "merging ranges for ");
2935                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2936                                      DR_REF (dr_a1->dr));
2937                   dump_printf (MSG_NOTE,  ", ");
2938                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2939                                      DR_REF (dr_b1->dr));
2940                   dump_printf (MSG_NOTE,  " and ");
2941                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2942                                      DR_REF (dr_a2->dr));
2943                   dump_printf (MSG_NOTE,  ", ");
2944                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2945                                      DR_REF (dr_b2->dr));
2946                   dump_printf (MSG_NOTE, "\n");
2947                 }
2948
2949               dr_a1->seg_len = size_binop (PLUS_EXPR,
2950                                            dr_a2->seg_len, size_int (diff));
2951               comp_alias_ddrs.ordered_remove (i--);
2952             }
2953         }
2954     }
2955
2956   dump_printf_loc (MSG_NOTE, vect_location,
2957                    "improved number of alias checks from %d to %d\n",
2958                    may_alias_ddrs.length (), comp_alias_ddrs.length ());
2959   if ((int) comp_alias_ddrs.length () >
2960       PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2961     return false;
2962
2963   return true;
2964 }
2965
2966 /* Check whether a non-affine read in stmt is suitable for gather load
2967    and if so, return a builtin decl for that operation.  */
2968
2969 tree
2970 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2971                    tree *offp, int *scalep)
2972 {
2973   HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2974   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2975   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2976   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2977   tree offtype = NULL_TREE;
2978   tree decl, base, off;
2979   machine_mode pmode;
2980   int punsignedp, pvolatilep;
2981
2982   base = DR_REF (dr);
2983   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
2984      see if we can use the def stmt of the address.  */
2985   if (is_gimple_call (stmt)
2986       && gimple_call_internal_p (stmt)
2987       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
2988           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
2989       && TREE_CODE (base) == MEM_REF
2990       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
2991       && integer_zerop (TREE_OPERAND (base, 1))
2992       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
2993     {
2994       gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
2995       if (is_gimple_assign (def_stmt)
2996           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
2997         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
2998     }
2999
3000   /* The gather builtins need address of the form
3001      loop_invariant + vector * {1, 2, 4, 8}
3002      or
3003      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3004      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3005      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3006      multiplications and additions in it.  To get a vector, we need
3007      a single SSA_NAME that will be defined in the loop and will
3008      contain everything that is not loop invariant and that can be
3009      vectorized.  The following code attempts to find such a preexistng
3010      SSA_NAME OFF and put the loop invariants into a tree BASE
3011      that can be gimplified before the loop.  */
3012   base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
3013                               &pmode, &punsignedp, &pvolatilep, false);
3014   gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3015
3016   if (TREE_CODE (base) == MEM_REF)
3017     {
3018       if (!integer_zerop (TREE_OPERAND (base, 1)))
3019         {
3020           if (off == NULL_TREE)
3021             {
3022               offset_int moff = mem_ref_offset (base);
3023               off = wide_int_to_tree (sizetype, moff);
3024             }
3025           else
3026             off = size_binop (PLUS_EXPR, off,
3027                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3028         }
3029       base = TREE_OPERAND (base, 0);
3030     }
3031   else
3032     base = build_fold_addr_expr (base);
3033
3034   if (off == NULL_TREE)
3035     off = size_zero_node;
3036
3037   /* If base is not loop invariant, either off is 0, then we start with just
3038      the constant offset in the loop invariant BASE and continue with base
3039      as OFF, otherwise give up.
3040      We could handle that case by gimplifying the addition of base + off
3041      into some SSA_NAME and use that as off, but for now punt.  */
3042   if (!expr_invariant_in_loop_p (loop, base))
3043     {
3044       if (!integer_zerop (off))
3045         return NULL_TREE;
3046       off = base;
3047       base = size_int (pbitpos / BITS_PER_UNIT);
3048     }
3049   /* Otherwise put base + constant offset into the loop invariant BASE
3050      and continue with OFF.  */
3051   else
3052     {
3053       base = fold_convert (sizetype, base);
3054       base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3055     }
3056
3057   /* OFF at this point may be either a SSA_NAME or some tree expression
3058      from get_inner_reference.  Try to peel off loop invariants from it
3059      into BASE as long as possible.  */
3060   STRIP_NOPS (off);
3061   while (offtype == NULL_TREE)
3062     {
3063       enum tree_code code;
3064       tree op0, op1, add = NULL_TREE;
3065
3066       if (TREE_CODE (off) == SSA_NAME)
3067         {
3068           gimple def_stmt = SSA_NAME_DEF_STMT (off);
3069
3070           if (expr_invariant_in_loop_p (loop, off))
3071             return NULL_TREE;
3072
3073           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3074             break;
3075
3076           op0 = gimple_assign_rhs1 (def_stmt);
3077           code = gimple_assign_rhs_code (def_stmt);
3078           op1 = gimple_assign_rhs2 (def_stmt);
3079         }
3080       else
3081         {
3082           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3083             return NULL_TREE;
3084           code = TREE_CODE (off);
3085           extract_ops_from_tree (off, &code, &op0, &op1);
3086         }
3087       switch (code)
3088         {
3089         case POINTER_PLUS_EXPR:
3090         case PLUS_EXPR:
3091           if (expr_invariant_in_loop_p (loop, op0))
3092             {
3093               add = op0;
3094               off = op1;
3095             do_add:
3096               add = fold_convert (sizetype, add);
3097               if (scale != 1)
3098                 add = size_binop (MULT_EXPR, add, size_int (scale));
3099               base = size_binop (PLUS_EXPR, base, add);
3100               continue;
3101             }
3102           if (expr_invariant_in_loop_p (loop, op1))
3103             {
3104               add = op1;
3105               off = op0;
3106               goto do_add;
3107             }
3108           break;
3109         case MINUS_EXPR:
3110           if (expr_invariant_in_loop_p (loop, op1))
3111             {
3112               add = fold_convert (sizetype, op1);
3113               add = size_binop (MINUS_EXPR, size_zero_node, add);
3114               off = op0;
3115               goto do_add;
3116             }
3117           break;
3118         case MULT_EXPR:
3119           if (scale == 1 && tree_fits_shwi_p (op1))
3120             {
3121               scale = tree_to_shwi (op1);
3122               off = op0;
3123               continue;
3124             }
3125           break;
3126         case SSA_NAME:
3127           off = op0;
3128           continue;
3129         CASE_CONVERT:
3130           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3131               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3132             break;
3133           if (TYPE_PRECISION (TREE_TYPE (op0))
3134               == TYPE_PRECISION (TREE_TYPE (off)))
3135             {
3136               off = op0;
3137               continue;
3138             }
3139           if (TYPE_PRECISION (TREE_TYPE (op0))
3140               < TYPE_PRECISION (TREE_TYPE (off)))
3141             {
3142               off = op0;
3143               offtype = TREE_TYPE (off);
3144               STRIP_NOPS (off);
3145               continue;
3146             }
3147           break;
3148         default:
3149           break;
3150         }
3151       break;
3152     }
3153
3154   /* If at the end OFF still isn't a SSA_NAME or isn't
3155      defined in the loop, punt.  */
3156   if (TREE_CODE (off) != SSA_NAME
3157       || expr_invariant_in_loop_p (loop, off))
3158     return NULL_TREE;
3159
3160   if (offtype == NULL_TREE)
3161     offtype = TREE_TYPE (off);
3162
3163   decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3164                                            offtype, scale);
3165   if (decl == NULL_TREE)
3166     return NULL_TREE;
3167
3168   if (basep)
3169     *basep = base;
3170   if (offp)
3171     *offp = off;
3172   if (scalep)
3173     *scalep = scale;
3174   return decl;
3175 }
3176
3177 /* Function vect_analyze_data_refs.
3178
3179   Find all the data references in the loop or basic block.
3180
3181    The general structure of the analysis of data refs in the vectorizer is as
3182    follows:
3183    1- vect_analyze_data_refs(loop/bb): call
3184       compute_data_dependences_for_loop/bb to find and analyze all data-refs
3185       in the loop/bb and their dependences.
3186    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3187    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3188    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3189
        (In this function itself the data-refs are collected by calling
        find_data_references_in_stmt on each non-debug statement.)

        LOOP_VINFO, BB_VINFO: when LOOP_VINFO is non-NULL the statements of
          every block of its loop are scanned; otherwise the statements of the
          block of BB_VINFO are scanned.
        MIN_VF: *MIN_VF is raised to TYPE_VECTOR_SUBPARTS of the vector type
          chosen for a data reference whenever that value is larger.
        N_STMTS: *N_STMTS is incremented for every non-debug statement scanned.

        Returns false when the analysis fails.  For a basic block most
        failures instead end the walk at the offending data reference: that
        data reference and all later ones are freed, their statements are
        marked as not vectorizable, and true is returned.
3190 */
3191
3192 bool
3193 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3194                         bb_vec_info bb_vinfo,
3195                         int *min_vf, unsigned *n_stmts)
3196 {
3197   struct loop *loop = NULL;
3198   basic_block bb = NULL;
3199   unsigned int i;
3200   vec<data_reference_p> datarefs;
3201   struct data_reference *dr;
3202   tree scalar_type;
3203
3204   if (dump_enabled_p ())
3205     dump_printf_loc (MSG_NOTE, vect_location,
3206                      "=== vect_analyze_data_refs ===\n");
3207
       /* Loop vectorization: collect the data references of every non-debug
          statement in every block of the loop.  */
3208   if (loop_vinfo)
3209     {
3210       basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3211
3212       loop = LOOP_VINFO_LOOP (loop_vinfo);
3213       datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3214       if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3215         {
3216           if (dump_enabled_p ())
3217             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3218                              "not vectorized: loop contains function calls"
3219                              " or data references that cannot be analyzed\n");
3220           return false;
3221         }
3222
3223       for (i = 0; i < loop->num_nodes; i++)
3224         {
3225           gimple_stmt_iterator gsi;
3226
3227           for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3228             {
3229               gimple stmt = gsi_stmt (gsi);
3230               if (is_gimple_debug (stmt))
3231                 continue;
3232               ++*n_stmts;
3233               if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3234                 {
                       /* A call whose data references could not be collected
                          is tolerated when the loop has a safelen, the callee
                          has simd clones, and neither an argument nor the lhs
                          is a decl or a reference with a base address.  */
3235                   if (is_gimple_call (stmt) && loop->safelen)
3236                     {
3237                       tree fndecl = gimple_call_fndecl (stmt), op;
3238                       if (fndecl != NULL_TREE)
3239                         {
3240                           struct cgraph_node *node = cgraph_node::get (fndecl);
3241                           if (node != NULL && node->simd_clones != NULL)
3242                             {
3243                               unsigned int j, n = gimple_call_num_args (stmt);
3244                               for (j = 0; j < n; j++)
3245                                 {
3246                                   op = gimple_call_arg (stmt, j);
3247                                   if (DECL_P (op)
3248                                       || (REFERENCE_CLASS_P (op)
3249                                           && get_base_address (op)))
3250                                     break;
3251                                 }
3252                               op = gimple_call_lhs (stmt);
3253                               /* Ignore #pragma omp declare simd functions
3254                                  if they don't have data references in the
3255                                  call stmt itself.  */
3256                               if (j == n
3257                                   && !(op
3258                                        && (DECL_P (op)
3259                                            || (REFERENCE_CLASS_P (op)
3260                                                && get_base_address (op)))))
3261                                 continue;
3262                             }
3263                         }
3264                     }
                       /* Write the local copy of the vector back before
                          giving up.  */
3265                   LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3266                   if (dump_enabled_p ())
3267                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3268                                      "not vectorized: loop contains function "
3269                                      "calls or data references that cannot "
3270                                      "be analyzed\n");
3271                   return false;
3272                 }
3273             }
3274         }
3275
3276       LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3277     }
3278   else
3279     {
           /* Basic-block vectorization: collect data references statement by
              statement until one statement cannot be handled.  */
3280       gimple_stmt_iterator gsi;
3281
3282       bb = BB_VINFO_BB (bb_vinfo);
3283       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3284         {
3285           gimple stmt = gsi_stmt (gsi);
3286           if (is_gimple_debug (stmt))
3287             continue;
3288           ++*n_stmts;
3289           if (!find_data_references_in_stmt (NULL, stmt,
3290                                              &BB_VINFO_DATAREFS (bb_vinfo)))
3291             {
3292               /* Mark the rest of the basic-block as unvectorizable.  */
3293               for (; !gsi_end_p (gsi); gsi_next (&gsi))
3294                 {
3295                   stmt = gsi_stmt (gsi);
3296                   STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3297                 }
3298               break;
3299             }
3300         }
3301
3302       datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3303     }
3304
3305   /* Go through the data-refs, check that the analysis succeeded.  Update
3306      pointer from stmt_vec_info struct to DR and vectype.  */
3307
3308   FOR_EACH_VEC_ELT (datarefs, i, dr)
3309     {
3310       gimple stmt;
3311       stmt_vec_info stmt_info;
3312       tree base, offset, init;
3313       bool gather = false;
3314       bool simd_lane_access = false;
3315       int vf;
3316
           /* Re-entry point used after the data reference of a clobber has
              been removed and DR has been reloaded from slot I.  */
3317 again:
3318       if (!dr || !DR_REF (dr))
3319         {
3320           if (dump_enabled_p ())
3321             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3322                              "not vectorized: unhandled data-ref\n");
3323           return false;
3324         }
3325
3326       stmt = DR_STMT (dr);
3327       stmt_info = vinfo_for_stmt (stmt);
3328
3329       /* Discard clobbers from the dataref vector.  We will remove
3330          clobber stmts during vectorization.  */
3331       if (gimple_clobber_p (stmt))
3332         {
3333           free_data_ref (dr);
3334           if (i == datarefs.length () - 1)
3335             {
3336               datarefs.pop ();
3337               break;
3338             }
3339           datarefs.ordered_remove (i);
3340           dr = datarefs[i];
3341           goto again;
3342         }
3343
3344       /* Check that analysis of the data-ref succeeded.  */
3345       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3346           || !DR_STEP (dr))
3347         {
3348           bool maybe_gather
3349             = DR_IS_READ (dr)
3350               && !TREE_THIS_VOLATILE (DR_REF (dr))
3351               && targetm.vectorize.builtin_gather != NULL;
3352           bool maybe_simd_lane_access
3353             = loop_vinfo && loop->simduid;
3354
3355           /* If target supports vector gather loads, or if this might be
3356              a SIMD lane access, see if they can't be used.  */
3357           if (loop_vinfo
3358               && (maybe_gather || maybe_simd_lane_access)
3359               && !nested_in_vect_loop_p (loop, stmt))
3360             {
                   /* Build a second data reference for the same access, this
                      time passing the loop given by loop_containing_stmt
                      (STMT).  Only a result that is fully analyzed and has a
                      zero step is considered below.  */
3361               struct data_reference *newdr
3362                 = create_data_ref (NULL, loop_containing_stmt (stmt),
3363                                    DR_REF (dr), stmt, true);
3364               gcc_assert (newdr != NULL && DR_REF (newdr));
3365               if (DR_BASE_ADDRESS (newdr)
3366                   && DR_OFFSET (newdr)
3367                   && DR_INIT (newdr)
3368                   && DR_STEP (newdr)
3369                   && integer_zerop (DR_STEP (newdr)))
3370                 {
                       /* Recognize an offset of the form
                          (possibly widened) GOMP_SIMD_LANE (simduid) * STEP
                          where STEP equals the size of the accessed type.
                          Such an access gets offset 0 and step STEP.  */
3371                   if (maybe_simd_lane_access)
3372                     {
3373                       tree off = DR_OFFSET (newdr);
3374                       STRIP_NOPS (off);
3375                       if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3376                           && TREE_CODE (off) == MULT_EXPR
3377                           && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3378                         {
3379                           tree step = TREE_OPERAND (off, 1);
3380                           off = TREE_OPERAND (off, 0);
3381                           STRIP_NOPS (off);
3382                           if (CONVERT_EXPR_P (off)
3383                               && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3384                                                                           0)))
3385                                  < TYPE_PRECISION (TREE_TYPE (off)))
3386                             off = TREE_OPERAND (off, 0);
3387                           if (TREE_CODE (off) == SSA_NAME)
3388                             {
3389                               gimple def = SSA_NAME_DEF_STMT (off);
3390                               tree reft = TREE_TYPE (DR_REF (newdr));
3391                               if (is_gimple_call (def)
3392                                   && gimple_call_internal_p (def)
3393                                   && (gimple_call_internal_fn (def)
3394                                       == IFN_GOMP_SIMD_LANE))
3395                                 {
3396                                   tree arg = gimple_call_arg (def, 0);
3397                                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
3398                                   arg = SSA_NAME_VAR (arg);
3399                                   if (arg == loop->simduid
3400                                       /* For now.  */
3401                                       && tree_int_cst_equal
3402                                            (TYPE_SIZE_UNIT (reft),
3403                                             step))
3404                                     {
3405                                       DR_OFFSET (newdr) = ssize_int (0);
3406                                       DR_STEP (newdr) = step;
3407                                       DR_ALIGNED_TO (newdr)
3408                                         = size_int (BIGGEST_ALIGNMENT);
3409                                       dr = newdr;
3410                                       simd_lane_access = true;
3411                                     }
3412                                 }
3413                             }
3414                         }
3415                     }
3416                   if (!simd_lane_access && maybe_gather)
3417                     {
3418                       dr = newdr;
3419                       gather = true;
3420                     }
3421                 }
                   /* NEWDR was not adopted as DR, so release it.  */
3422               if (!gather && !simd_lane_access)
3423                 free_data_ref (newdr);
3424             }
3425
3426           if (!gather && !simd_lane_access)
3427             {
3428               if (dump_enabled_p ())
3429                 {
3430                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3431                                    "not vectorized: data ref analysis "
3432                                    "failed ");
3433                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3434                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3435                 }
3436
                   /* For a basic block, stop the walk here; the data
                      references from this one on are discarded after the
                      loop.  The same applies to the later failure paths.  */
3437               if (bb_vinfo)
3438                 break;
3439
3440               return false;
3441             }
3442         }
3443
3444       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3445         {
3446           if (dump_enabled_p ())
3447             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3448                              "not vectorized: base addr of dr is a "
3449                              "constant\n");
3450
3451           if (bb_vinfo)
3452             break;
3453
               /* A gather or simd-lane DR is the replacement created above; it
                  is not stored in DATAREFS yet, so it is released here.  */
3454           if (gather || simd_lane_access)
3455             free_data_ref (dr);
3456           return false;
3457         }
3458
3459       if (TREE_THIS_VOLATILE (DR_REF (dr)))
3460         {
3461           if (dump_enabled_p ())
3462             {
3463               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3464                                "not vectorized: volatile type ");
3465               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3466               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3467             }
3468
3469           if (bb_vinfo)
3470             break;
3471
               /* Release a replacement DR exactly as the sibling failure
                  paths do; a simd-lane access to a volatile reference would
                  otherwise leak it.  */
               if (gather || simd_lane_access)
                 free_data_ref (dr);
3472           return false;
3473         }
3474
3475       if (stmt_can_throw_internal (stmt))
3476         {
3477           if (dump_enabled_p ())
3478             {
3479               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3480                                "not vectorized: statement can throw an "
3481                                "exception ");
3482               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3483               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3484             }
3485
3486           if (bb_vinfo)
3487             break;
3488
3489           if (gather || simd_lane_access)
3490             free_data_ref (dr);
3491           return false;
3492         }
3493
3494       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3495           && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3496         {
3497           if (dump_enabled_p ())
3498             {
3499               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3500                                "not vectorized: statement is bitfield "
3501                                "access ");
3502               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3503               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3504             }
3505
3506           if (bb_vinfo)
3507             break;
3508
3509           if (gather || simd_lane_access)
3510             free_data_ref (dr);
3511           return false;
3512         }
3513
3514       base = unshare_expr (DR_BASE_ADDRESS (dr));
3515       offset = unshare_expr (DR_OFFSET (dr));
3516       init = unshare_expr (DR_INIT (dr));
3517
           /* A data reference in a call is only accepted for the internal
              functions IFN_MASK_LOAD and IFN_MASK_STORE.  */
3518       if (is_gimple_call (stmt)
3519           && (!gimple_call_internal_p (stmt)
3520               || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3521                   && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3522         {
3523           if (dump_enabled_p ())
3524             {
3525               dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
3526                                "not vectorized: dr in a call ");
3527               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3528               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3529             }
3530
3531           if (bb_vinfo)
3532             break;
3533
3534           if (gather || simd_lane_access)
3535             free_data_ref (dr);
3536           return false;
3537         }
3538
3539       /* Update DR field in stmt_vec_info struct.  */
3540
3541       /* If the dataref is in an inner-loop of the loop that is considered for
3542          vectorization, we also want to analyze the access relative to
3543          the outer-loop (DR contains information only relative to the
3544          inner-most enclosing loop).  We do that by building a reference to the
3545          first location accessed by the inner-loop, and analyze it relative to
3546          the outer-loop.  */
3547       if (loop && nested_in_vect_loop_p (loop, stmt))
3548         {
3549           tree outer_step, outer_base, outer_init;
3550           HOST_WIDE_INT pbitsize, pbitpos;
3551           tree poffset;
3552           machine_mode pmode;
3553           int punsignedp, pvolatilep;
3554           affine_iv base_iv, offset_iv;
3555           tree dinit;
3556
3557           /* Build a reference to the first location accessed by the
3558              inner-loop: *(BASE+INIT).  (The first location is actually
3559              BASE+INIT+OFFSET, but we add OFFSET separately later).  */
3560           tree inner_base = build_fold_indirect_ref
3561                                 (fold_build_pointer_plus (base, init));
3562
3563           if (dump_enabled_p ())
3564             {
3565               dump_printf_loc (MSG_NOTE, vect_location,
3566                                "analyze in outer-loop: ");
3567               dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3568               dump_printf (MSG_NOTE, "\n");
3569             }
3570
3571           outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3572                           &poffset, &pmode, &punsignedp, &pvolatilep, false);
3573           gcc_assert (outer_base != NULL_TREE);
3574
3575           if (pbitpos % BITS_PER_UNIT != 0)
3576             {
3577               if (dump_enabled_p ())
3578                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3579                                  "failed: bit offset alignment.\n");
3580               return false;
3581             }
3582
3583           outer_base = build_fold_addr_expr (outer_base);
3584           if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3585                           &base_iv, false))
3586             {
3587               if (dump_enabled_p ())
3588                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3589                                  "failed: evolution of base is not affine.\n");
3590               return false;
3591             }
3592
               /* Fold the data reference's own OFFSET into the offset that
                  get_inner_reference returned, if any.  */
3593           if (offset)
3594             {
3595               if (poffset)
3596                 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3597                                        poffset);
3598               else
3599                 poffset = offset;
3600             }
3601
3602           if (!poffset)
3603             {
3604               offset_iv.base = ssize_int (0);
3605               offset_iv.step = ssize_int (0);
3606             }
3607           else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3608                                &offset_iv, false))
3609             {
3610               if (dump_enabled_p ())
3611                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3612                                  "evolution of offset is not affine.\n");
3613               return false;
3614             }
3615
               /* The outer init is the byte position plus the constant parts
                  split off the bases of both induction variables.  */
3616           outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3617           split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3618           outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3619           split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3620           outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3621
3622           outer_step = size_binop (PLUS_EXPR,
3623                                 fold_convert (ssizetype, base_iv.step),
3624                                 fold_convert (ssizetype, offset_iv.step));
3625
3626           STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3627           /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3628           STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3629           STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3630           STMT_VINFO_DR_OFFSET (stmt_info) =
3631                                 fold_convert (ssizetype, offset_iv.base);
3632           STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3633                                 size_int (highest_pow2_factor (offset_iv.base));
3634
3635           if (dump_enabled_p ())
3636             {
3637               dump_printf_loc (MSG_NOTE, vect_location,
3638                                "\touter base_address: ");
3639               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3640                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3641               dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3642               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3643                                  STMT_VINFO_DR_OFFSET (stmt_info));
3644               dump_printf (MSG_NOTE,
3645                            "\n\touter constant offset from base address: ");
3646               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3647                                  STMT_VINFO_DR_INIT (stmt_info));
3648               dump_printf (MSG_NOTE, "\n\touter step: ");
3649               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3650                                  STMT_VINFO_DR_STEP (stmt_info));
3651               dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3652               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3653                                  STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3654               dump_printf (MSG_NOTE, "\n");
3655             }
3656         }
3657
           /* STMT already has a data reference attached, so this one would
              be a second data reference for the same statement.  */
3658       if (STMT_VINFO_DATA_REF (stmt_info))
3659         {
3660           if (dump_enabled_p ())
3661             {
3662               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3663                                "not vectorized: more than one data ref "
3664                                "in stmt: ");
3665               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3666               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3667             }
3668
3669           if (bb_vinfo)
3670             break;
3671
3672           if (gather || simd_lane_access)
3673             free_data_ref (dr);
3674           return false;
3675         }
3676
3677       STMT_VINFO_DATA_REF (stmt_info) = dr;
3678       if (simd_lane_access)
3679         {
               /* The replacement takes the slot of the original data
                  reference in DATAREFS; the original is released.  */
3680           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3681           free_data_ref (datarefs[i]);
3682           datarefs[i] = dr;
3683         }
3684
3685       /* Set vectype for STMT.  */
3686       scalar_type = TREE_TYPE (DR_REF (dr));
3687       STMT_VINFO_VECTYPE (stmt_info)
3688         = get_vectype_for_scalar_type (scalar_type);
3689       if (!STMT_VINFO_VECTYPE (stmt_info))
3690         {
3691           if (dump_enabled_p ())
3692             {
3693               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3694                                "not vectorized: no vectype for stmt: ");
3695               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3696               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3697               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3698                                  scalar_type);
3699               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3700             }
3701
3702           if (bb_vinfo)
3703             break;
3704
               /* Detach DR from STMT.  A simd-lane replacement was stored in
                  DATAREFS just above, so only a gather replacement is freed
                  here.  */
3705           if (gather || simd_lane_access)
3706             {
3707               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3708               if (gather)
3709                 free_data_ref (dr);
3710             }
3711           return false;
3712         }
3713       else
3714         {
3715           if (dump_enabled_p ())
3716             {
3717               dump_printf_loc (MSG_NOTE, vect_location,
3718                                "got vectype for stmt: ");
3719               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3720               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3721                                  STMT_VINFO_VECTYPE (stmt_info));
3722               dump_printf (MSG_NOTE, "\n");
3723             }
3724         }
3725
3726       /* Adjust the minimal vectorization factor according to the
3727          vector type.  */
3728       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3729       if (vf > *min_vf)
3730         *min_vf = vf;
3731
3732       if (gather)
3733         {
3734           tree off;
3735
               /* Confirm that vect_check_gather finds a gather decl and that
                  the type of the offset it reports has a vector type.  */
3736           gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3737           if (gather
3738               && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3739             gather = false;
3740           if (!gather)
3741             {
3742               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3743               free_data_ref (dr);
3744               if (dump_enabled_p ())
3745                 {
3746                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3747                                    "not vectorized: not suitable for gather "
3748                                    "load ");
3749                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3750                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3751                 }
3752               return false;
3753             }
3754
               /* The gather data reference replaces the original entry of
                  DATAREFS.  Release the original first, as is done for a
                  simd-lane access, so that it is not leaked.  */
               free_data_ref (datarefs[i]);
3755           datarefs[i] = dr;
3756           STMT_VINFO_GATHER_P (stmt_info) = true;
3757         }
3758       else if (loop_vinfo
3759                && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3760         {
               /* A step that is not a compile-time constant is treated as a
                  strided access, which is rejected inside a nested loop.  */
3761           if (nested_in_vect_loop_p (loop, stmt))
3762             {
3763               if (dump_enabled_p ())
3764                 {
3765                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3766                                    "not vectorized: not suitable for strided "
3767                                    "load ");
3768                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3769                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3770                 }
3771               return false;
3772             }
3773           STMT_VINFO_STRIDED_P (stmt_info) = true;
3774         }
3775     }
3776
3777   /* If we stopped analysis at the first dataref we could not analyze
3778      when trying to vectorize a basic-block mark the rest of the datarefs
3779      as not vectorizable and truncate the vector of datarefs.  That
3780      avoids spending useless time in analyzing their dependence.  */
3781   if (i != datarefs.length ())
3782     {
3783       gcc_assert (bb_vinfo != NULL);
3784       for (unsigned j = i; j < datarefs.length (); ++j)
3785         {
3786           data_reference_p dr = datarefs[j];
3787           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3788           free_data_ref (dr);
3789         }
3790       datarefs.truncate (i);
3791     }
3792
3793   return true;
3794 }
3795
3796
3797 /* Function vect_get_new_vect_var.
3798
3799    Hand back the temporary that create_tmp_reg makes for TYPE, labelled
3800    according to VAR_KIND and NAME.
3801
3802    TYPE:     forwarded unchanged to create_tmp_reg.
3803    VAR_KIND: selects the stem of the label given to create_tmp_reg: "vect"
3804              for vect_simple_var, "stmp" for vect_scalar_var and "vectp"
3805              for vect_pointer_var.  Any other value is unreachable.
3806    NAME:     optional.  When it is non-NULL, "_" followed by NAME is
3807              appended to the stem.  */
3808
3809 tree
3810 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3811 {
3812   const char *stem;
3813
3814   /* Pick the stem that corresponds to the kind of variable.  */
3815   if (var_kind == vect_simple_var)
3816     stem = "vect";
3817   else if (var_kind == vect_scalar_var)
3818     stem = "stmp";
3819   else if (var_kind == vect_pointer_var)
3820     stem = "vectp";
3821   else
3822     gcc_unreachable ();
3823
3824   /* With no NAME the bare stem is the complete label.  */
3825   if (!name)
3826     return create_tmp_reg (type, stem);
3827
3828   /* Otherwise glue stem and NAME together in a scratch string, which is
3829      released again as soon as create_tmp_reg has been called with it.  */
3830   char *label = concat (stem, "_", name, NULL);
3831   tree fresh_var = create_tmp_reg (type, label);
3832   free (label);
3833
3834   return fresh_var;
3835 }
3836
3837 /* Give NAME a copy of DR's pointer info, then record NAME's alignment.  */
3838
3839 static void
3840 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3841                                   stmt_vec_info stmt_info)
3842 {
3843   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3844   unsigned int vec_align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3845   int dr_misalign = DR_MISALIGNMENT (dr);
3846   if (dr_misalign != -1)
3847     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), vec_align, dr_misalign);
3848   else
3849     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3850 }
3851
3852 /* Function vect_create_addr_base_for_vector_ref.
3853
3854    Create an expression that computes the address of the first memory location
3855    that will be accessed for a data reference.
3856
3857    Input:
3858    STMT: The statement containing the data reference.
3859    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3860    OFFSET: Optional. If supplied, it is added to the initial address.
3861    LOOP:    Specify relative to which loop-nest should the address be computed.
3862             For example, when the dataref is in an inner-loop nested in an
3863             outer-loop that is now being vectorized, LOOP can be either the
3864             outer-loop, or the inner-loop.  The first memory location accessed
3865             by the following dataref ('in' points to short):
3866
3867                 for (i=0; i<N; i++)
3868                    for (j=0; j<M; j++)
3869                      s += in[i+j]
3870
3871             is as follows:
3872             if LOOP=i_loop:     &in             (relative to i_loop)
3873             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
3874    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
3875             initial address.  Unlike OFFSET, which is number of elements to
3876             be added, BYTE_OFFSET is measured in bytes.
3877
3878    Output:
3879    1. Return an SSA_NAME whose value is the address of the memory location of
3880       the first vector of the data reference.
3881    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3882       these statement(s) which define the returned SSA_NAME.
3883
3884    FORNOW: We are only handling array accesses with step 1.  */
3885
3886 tree
3887 vect_create_addr_base_for_vector_ref (gimple stmt,
3888                                       gimple_seq *new_stmt_list,
3889                                       tree offset,
3890                                       struct loop *loop,
3891                                       tree byte_offset)
3892 {
3893   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3894   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3895   tree data_ref_base;
3896   const char *base_name;
3897   tree addr_base;
3898   tree dest;
3899   gimple_seq seq = NULL;
3900   tree base_offset;
3901   tree init;
3902   tree vect_ptr_type;
3903   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3904   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3905
3906   if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3907     {
3908       struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3909
3910       gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3911       /* STMT is nested in the vectorized loop: use STMT_INFO's DR fields.  */
3912       data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3913       base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3914       init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3915     }
3916   else
3917     {
3918       data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3919       base_offset = unshare_expr (DR_OFFSET (dr));
3920       init = unshare_expr (DR_INIT (dr));
3921     }
3922
3923   if (loop_vinfo)
3924     base_name = get_name (data_ref_base);
3925   else
3926     {
3927       base_offset = ssize_int (0);
3928       init = ssize_int (0);
3929       base_name = get_name (DR_REF (dr));
3930     }
3931
3932   /* Create base_offset: the sum of the offset and init chosen above, as sizetype.  */
3933   base_offset = size_binop (PLUS_EXPR,
3934                             fold_convert (sizetype, base_offset),
3935                             fold_convert (sizetype, init));
3936   /* OFFSET counts elements, so it is scaled by the element size STEP.  */
3937   if (offset)
3938     {
3939       offset = fold_build2 (MULT_EXPR, sizetype,
3940                             fold_convert (sizetype, offset), step);
3941       base_offset = fold_build2 (PLUS_EXPR, sizetype,
3942                                  base_offset, offset);
3943     }
3944   if (byte_offset)
3945     {
3946       byte_offset = fold_convert (sizetype, byte_offset);
3947       base_offset = fold_build2 (PLUS_EXPR, sizetype,
3948                                  base_offset, byte_offset);
3949     }
3950
3951   /* base + base_offset.  NOTE(review): the else arm never adds base_offset.  */
3952   if (loop_vinfo)
3953     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3954   else
3955     {
3956       addr_base = build1 (ADDR_EXPR,
3957                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
3958                           unshare_expr (DR_REF (dr)));
3959     }
3960   /* Convert to pointer-to-vectype; gimplify into SEQ, appended to NEW_STMT_LIST.  */
3961   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3962   addr_base = fold_convert (vect_ptr_type, addr_base);
3963   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3964   addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3965   gimple_seq_add_seq (new_stmt_list, seq);
3966   /* Reuse DR's points-to info; alignment is marked unknown if an offset was added.  */
3967   if (DR_PTR_INFO (dr)
3968       && TREE_CODE (addr_base) == SSA_NAME)
3969     {
3970       vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
3971       if (offset || byte_offset)
3972         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3973     }
3974
3975   if (dump_enabled_p ())
3976     {
3977       dump_printf_loc (MSG_NOTE, vect_location, "created ");
3978       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3979       dump_printf (MSG_NOTE, "\n");
3980     }
3981
3982   return addr_base;
3983 }
3984
3985
3986 /* Function vect_create_data_ref_ptr.
3987
3988    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3989    location accessed in the loop by STMT, along with the def-use update
3990    chain to appropriately advance the pointer through the loop iterations.
3991    Also set aliasing information for the pointer.  This pointer is used by
3992    the callers to this function to create a memory reference expression for
3993    vector load/store access.
3994
3995    Input:
3996    1. STMT: a stmt that references memory. Expected to be of the form
3997          GIMPLE_ASSIGN <name, data-ref> or
3998          GIMPLE_ASSIGN <data-ref, name>.
3999    2. AGGR_TYPE: the type of the reference, which should be either a vector
4000         or an array.
4001    3. AT_LOOP: the loop where the vector memref is to be created.
4002    4. OFFSET (optional): an offset to be added to the initial address accessed
4003         by the data-ref in STMT.
4004    5. GSI: location where the new stmts are to be placed if there is no loop
4005    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4006         pointing to the initial address.
4007    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4008         to the initial address accessed by the data-ref in STMT.  This is
4009         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4010         in bytes.
4011
4012    Output:
4013    1. Declare a new ptr to vector_type, and have it point to the base of the
4014       data reference (initial address accessed by the data reference).
4015       For example, for vector of type V8HI, the following code is generated:
4016
4017       v8hi *ap;
4018       ap = (v8hi *)initial_address;
4019
4020       if OFFSET is not supplied:
4021          initial_address = &a[init];
4022       if OFFSET is supplied:
4023          initial_address = &a[init + OFFSET];
4024       if BYTE_OFFSET is supplied:
4025          initial_address = &a[init] + BYTE_OFFSET;
4026
4027       Return the initial_address in INITIAL_ADDRESS.
4028
4029    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4030       update the pointer in each iteration of the loop.
4031
4032       Return the increment stmt that updates the pointer in PTR_INCR, if given.
4033
4034    3. Set INV_P to true if the access pattern of the data reference in the
4035       vectorized loop is invariant.  Set it to false otherwise.
4036
4037    4. Return the pointer.  */
4038
4039 tree
4040 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
4041                           tree offset, tree *initial_address,
4042                           gimple_stmt_iterator *gsi, gimple *ptr_incr,
4043                           bool only_init, bool *inv_p, tree byte_offset)
4044 {
4045   const char *base_name;
4046   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4047   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4048   struct loop *loop = NULL;
4049   bool nested_in_vect_loop = false;
4050   struct loop *containing_loop = NULL;
4051   tree aggr_ptr_type;
4052   tree aggr_ptr;
4053   tree new_temp;
4054   gimple vec_stmt;
4055   gimple_seq new_stmt_list = NULL;
4056   edge pe = NULL;
4057   basic_block new_bb;
4058   tree aggr_ptr_init;
4059   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4060   tree aptr;
4061   gimple_stmt_iterator incr_gsi;
4062   bool insert_after;
4063   tree indx_before_incr, indx_after_incr;
4064   gimple incr;
4065   tree step;
4066   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4067
4068   gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4069               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4070
4071   if (loop_vinfo)
4072     {
4073       loop = LOOP_VINFO_LOOP (loop_vinfo);
4074       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4075       containing_loop = (gimple_bb (stmt))->loop_father;
4076       pe = loop_preheader_edge (loop);
4077     }
4078   else
4079     {
4080       gcc_assert (bb_vinfo);
4081       only_init = true;
4082       if (ptr_incr)
4083         *ptr_incr = NULL;
4084     }
4085
4086   /* Check the step (evolution) of the load in LOOP; record if it's invariant.  */
4087   if (nested_in_vect_loop)
4088     step = STMT_VINFO_DR_STEP (stmt_info);
4089   else
4090     step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4091
4092   if (integer_zerop (step))
4093     *inv_p = true;
4094   else
4095     *inv_p = false;
4096
4097   /* Create an expression for the first address accessed by this load
4098      in LOOP.  */
4099   base_name = get_name (DR_BASE_ADDRESS (dr));
4100
4101   if (dump_enabled_p ())
4102     {
4103       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4104       dump_printf_loc (MSG_NOTE, vect_location,
4105                        "create %s-pointer variable to type: ",
4106                        get_tree_code_name (TREE_CODE (aggr_type)));
4107       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4108       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4109         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4110       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4111         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4112       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4113         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4114       else
4115         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4116       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4117       dump_printf (MSG_NOTE, "\n");
4118     }
4119
4120   /* (1) Create the new aggregate-pointer variable.
4121      Vector and array types inherit the alias set of their component
4122      type by default so we need to use a ref-all pointer if the data
4123      reference does not conflict with the created aggregated data
4124      reference because it is not addressable.  */
4125   bool need_ref_all = false;
4126   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4127                               get_alias_set (DR_REF (dr))))
4128     need_ref_all = true;
4129   /* Likewise for any of the data references in the stmt group.  */
4130   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4131     {
4132       gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4133       do
4134         {
4135           stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4136           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4137           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4138                                       get_alias_set (DR_REF (sdr))))
4139             {
4140               need_ref_all = true;
4141               break;
4142             }
4143           orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4144         }
4145       while (orig_stmt);
4146     }
4147   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4148                                                need_ref_all);
4149   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4150
4151
4152   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4153      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4154      def-use update cycles for the pointer: one relative to the outer-loop
4155      (LOOP), which is what steps (2) and (3) below do.  The other is relative
4156      to the inner-loop (which is the inner-most loop containing the dataref),
4157      and this is done by step (4) below.
4158
4159      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4160      inner-most loop, and so steps (2),(3) work the same, and step (4) is
4161      redundant.  Steps (2),(3) create the following:
4162
4163         vp0 = &base_addr;
4164         LOOP:   vp1 = phi(vp0,vp2)
4165                 ...
4166                 ...
4167                 vp2 = vp1 + step
4168                 goto LOOP
4169
4170      If there is an inner-loop nested in loop, then step (4) will also be
4171      applied, and an additional update in the inner-loop will be created:
4172
4173         vp0 = &base_addr;
4174         LOOP:   vp1 = phi(vp0,vp2)
4175                 ...
4176         inner:     vp3 = phi(vp1,vp4)
4177                    vp4 = vp3 + inner_step
4178                    if () goto inner
4179                 ...
4180                 vp2 = vp1 + step
4181                 if () goto LOOP   */
4182
4183   /* (2) Calculate the initial address of the aggregate-pointer, and set
4184      the aggregate-pointer to point to it before the loop.  */
4185
4186   /* Create: &(base[init_val+offset]) + byte_offset, on the preheader edge or at GSI.  */
4187
4188   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4189                                                    offset, loop, byte_offset);
4190   if (new_stmt_list)
4191     {
4192       if (pe)
4193         {
4194           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4195           gcc_assert (!new_bb);
4196         }
4197       else
4198         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4199     }
4200
4201   *initial_address = new_temp;
4202
4203   /* Create: p = (aggr_type *) initial_base  */
4204   if (TREE_CODE (new_temp) != SSA_NAME
4205       || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
4206     {
4207       vec_stmt = gimple_build_assign (aggr_ptr,
4208                                       fold_convert (aggr_ptr_type, new_temp));
4209       aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
4210       /* Copy the points-to information if it exists. */
4211       if (DR_PTR_INFO (dr))
4212         vect_duplicate_ssa_name_ptr_info (aggr_ptr_init, dr, stmt_info);
4213       gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
4214       if (pe)
4215         {
4216           new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
4217           gcc_assert (!new_bb);
4218         }
4219       else
4220         gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
4221     }
4222   else
4223     aggr_ptr_init = new_temp;
4224
4225   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4226      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4227      inner-loop nested in LOOP (during outer-loop vectorization).  */
4228
4229   /* No update in loop is required.  */
4230   if (only_init && (!loop_vinfo || at_loop == loop))
4231     aptr = aggr_ptr_init;
4232   else
4233     {
4234       /* The step of the aggregate pointer is the type size.  */
4235       tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4236       /* One exception to the above is when the scalar step of the load in
4237          LOOP is zero. In this case the step here is also zero.  */
4238       if (*inv_p)
4239         iv_step = size_zero_node;
4240       else if (tree_int_cst_sgn (step) == -1)
4241         iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4242
4243       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4244
4245       create_iv (aggr_ptr_init,
4246                  fold_convert (aggr_ptr_type, iv_step),
4247                  aggr_ptr, loop, &incr_gsi, insert_after,
4248                  &indx_before_incr, &indx_after_incr);
4249       incr = gsi_stmt (incr_gsi);
4250       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4251
4252       /* Copy the points-to information if it exists. */
4253       if (DR_PTR_INFO (dr))
4254         {
4255           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4256           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4257         }
4258       if (ptr_incr)
4259         *ptr_incr = incr;
4260
4261       aptr = indx_before_incr;
4262     }
4263
4264   if (!nested_in_vect_loop || only_init)
4265     return aptr;
4266
4267
4268   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4269      nested in LOOP, if exists.  */
4270
4271   gcc_assert (nested_in_vect_loop);
4272   if (!only_init)
4273     {
4274       standard_iv_increment_position (containing_loop, &incr_gsi,
4275                                       &insert_after);
4276       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4277                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4278                  &indx_after_incr);
4279       incr = gsi_stmt (incr_gsi);
4280       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4281
4282       /* Copy the points-to information if it exists. */
4283       if (DR_PTR_INFO (dr))
4284         {
4285           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4286           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4287         }
4288       if (ptr_incr)
4289         *ptr_incr = incr;
4290
4291       return indx_before_incr;
4292     }
4293   else
4294     gcc_unreachable ();
4295 }
4296
4297
4298 /* Function bump_vector_ptr
4299
4300    Increment a pointer (to a vector type) by vector-size. If requested,
4301    i.e. if PTR-INCR is given, then also connect the new increment stmt
4302    to the existing def-use update-chain of the pointer, by modifying
4303    the PTR_INCR as illustrated below:
4304
4305    The pointer def-use update-chain before this function:
4306                         DATAREF_PTR = phi (p_0, p_2)
4307                         ....
4308         PTR_INCR:       p_2 = DATAREF_PTR + step
4309
4310    The pointer def-use update-chain after this function:
4311                         DATAREF_PTR = phi (p_0, p_2)
4312                         ....
4313                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4314                         ....
4315         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4316
4317    Input:
4318    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4319                  in the loop.
4320    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4321               the loop.  The increment amount across iterations is expected
4322               to be vector_size.
4323    GSI - location where the new update stmt is to be placed.
4324    STMT - the original scalar memory-access stmt that is being vectorized.
4325    BUMP - optional. The offset by which to bump the pointer. If not given,
4326           the offset is assumed to be vector_size.
4327
4328    Output: Return NEW_DATAREF_PTR as illustrated above.
4329
4330 */
4331
4332 tree
4333 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4334                  gimple stmt, tree bump)
4335 {
4336   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4337   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4338   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4339   tree update = TYPE_SIZE_UNIT (vectype);
4340   gassign *incr_stmt;
4341   ssa_op_iter iter;
4342   use_operand_p use_p;
4343   tree new_dataref_ptr;
4344   /* An explicit BUMP replaces the default update of one vector size.  */
4345   if (bump)
4346     update = bump;
4347   /* Emit NEW_DATAREF_PTR = DATAREF_PTR + UPDATE.  */
4348   new_dataref_ptr = copy_ssa_name (dataref_ptr);
4349   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4350                                    dataref_ptr, update);
4351   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4352
4353   /* Copy the points-to information if it exists; alignment is marked unknown.  */
4354   if (DR_PTR_INFO (dr))
4355     {
4356       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4357       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4358     }
4359
4360   if (!ptr_incr)
4361     return new_dataref_ptr;
4362
4363   /* Update the vector-pointer's cross-iteration increment to use NEW_DATAREF_PTR.  */
4364   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4365     {
4366       tree use = USE_FROM_PTR (use_p);
4367
4368       if (use == dataref_ptr)
4369         SET_USE (use_p, new_dataref_ptr);
4370       else
4371         gcc_assert (tree_int_cst_compare (use, update) == 0);
4372     }
4373
4374   return new_dataref_ptr;
4375 }
4376
4377
4378 /* Function vect_create_destination_var.
4379    Create a new temporary of type VECTYPE, or a scalar one of SCALAR_DEST's
4380    type when VECTYPE is null.  SCALAR_DEST must be an SSA_NAME.  */
4381
4382 tree
4383 vect_create_destination_var (tree scalar_dest, tree vectype)
4384 {
4385   tree vec_dest;
4386   const char *name;
4387   char *new_name;
4388   tree type;
4389   enum vect_var_kind kind;
4390
4391   kind = vectype ? vect_simple_var : vect_scalar_var;
4392   type = vectype ? vectype : TREE_TYPE (scalar_dest);
4393
4394   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4395   /* Name is "<name>_<version>", or "_<version>" if get_name returns NULL.  */
4396   name = get_name (scalar_dest);
4397   if (name)
4398     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4399   else
4400     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4401   vec_dest = vect_get_new_vect_var (type, kind, new_name);
4402   free (new_name);
4403
4404   return vec_dest;
4405 }
4406
4407 /* Function vect_grouped_store_supported.
4408    Returns FALSE unless COUNT is 3 or a power of two and VECTYPE has a
4409    vector mode.  Otherwise returns TRUE if every permutation selector built
4410    below (interleave or three-vector shuffle) passes can_vec_perm_p.  */
4411
4412 bool
4413 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4414 {
4415   machine_mode mode = TYPE_MODE (vectype);
4416
4417   /* vect_permute_store_chain requires the group size to be equal to 3 or
4418      be a power of two.  */
4419   if (count != 3 && exact_log2 (count) == -1)
4420     {
4421       if (dump_enabled_p ())
4422         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4423                          "the size of the group of accesses"
4424                          " is not a power of 2 or not equal to 3\n");
4425       return false;
4426     }
4427
4428   /* Check that the permutation is supported.  */
4429   if (VECTOR_MODE_P (mode))
4430     {
4431       unsigned int i, nelt = GET_MODE_NUNITS (mode);
4432       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4433
4434       if (count == 3)
4435         {
4436           unsigned int j0 = 0, j1 = 0, j2 = 0;
4437           unsigned int j;
4438
4439           for (j = 0; j < 3; j++)
4440             {
4441               int nelt0 = ((3 - j) * nelt) % 3;
4442               int nelt1 = ((3 - j) * nelt + 1) % 3;
4443               int nelt2 = ((3 - j) * nelt + 2) % 3;
4444               for (i = 0; i < nelt; i++)
4445                 {
4446                   if (3 * i + nelt0 < nelt)
4447                     sel[3 * i + nelt0] = j0++;
4448                   if (3 * i + nelt1 < nelt)
4449                     sel[3 * i + nelt1] = nelt + j1++;
4450                   if (3 * i + nelt2 < nelt)
4451                     sel[3 * i + nelt2] = 0;
4452                 }
4453               if (!can_vec_perm_p (mode, false, sel))
4454                 {
4455                   if (dump_enabled_p ())
4456                     dump_printf (MSG_MISSED_OPTIMIZATION,
4457                                  "permutation op not supported by target.\n");
4458                   return false;
4459                 }
4460               /* Second selector: keep two of every three slots, fill the third.  */
4461               for (i = 0; i < nelt; i++)
4462                 {
4463                   if (3 * i + nelt0 < nelt)
4464                     sel[3 * i + nelt0] = 3 * i + nelt0;
4465                   if (3 * i + nelt1 < nelt)
4466                     sel[3 * i + nelt1] = 3 * i + nelt1;
4467                   if (3 * i + nelt2 < nelt)
4468                     sel[3 * i + nelt2] = nelt + j2++;
4469                 }
4470               if (!can_vec_perm_p (mode, false, sel))
4471                 {
4472                   if (dump_enabled_p ())
4473                     dump_printf (MSG_MISSED_OPTIMIZATION,
4474                                  "permutation op not supported by target.\n");
4475                   return false;
4476                 }
4477             }
4478           return true;
4479         }
4480       else
4481         {
4482           /* If length is not equal to 3 then only power of 2 is supported.  */
4483           gcc_assert (exact_log2 (count) != -1);
4484           /* First selector: {0, nelt, 1, nelt + 1, ...}.  */
4485           for (i = 0; i < nelt / 2; i++)
4486             {
4487               sel[i * 2] = i;
4488               sel[i * 2 + 1] = i + nelt;
4489             }
4490           if (can_vec_perm_p (mode, false, sel))
4491             {
4492               for (i = 0; i < nelt; i++)
4493                 sel[i] += nelt / 2;
4494               if (can_vec_perm_p (mode, false, sel))
4495                 return true;
4496             }
4497         }
4498     }
4499
4500   if (dump_enabled_p ())
4501     dump_printf (MSG_MISSED_OPTIMIZATION,
4502                  "permutation op not supported by target.\n");
4503   return false;
4504 }
4505
4506
4507 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4508    type VECTYPE.  The query is delegated to vect_lanes_optab_supported_p.  */
4509
4510 bool
4511 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4512 {
4513   return vect_lanes_optab_supported_p ("vec_store_lanes",
4514                                        vec_store_lanes_optab,
4515                                        vectype, count);
4516 }
4517
4518
4519 /* Function vect_permute_store_chain.
4520
4521    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4522    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4523    the data correctly for the stores.  Return the final references for stores
4524    in RESULT_CHAIN.
4525
4526    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4527    The input is 4 vectors each containing 8 elements.  We assign a number to
4528    each element, the input sequence is:
4529
4530    1st vec:   0  1  2  3  4  5  6  7
4531    2nd vec:   8  9 10 11 12 13 14 15
4532    3rd vec:  16 17 18 19 20 21 22 23
4533    4th vec:  24 25 26 27 28 29 30 31
4534
4535    The output sequence should be:
4536
4537    1st vec:  0  8 16 24  1  9 17 25
4538    2nd vec:  2 10 18 26  3 11 19 27
4539    3rd vec:  4 12 20 28  5 13 21 29
4540    4th vec:  6 14 22 30  7 15 23 31
4541
4542    i.e., we interleave the contents of the four vectors in their order.
4543
4544    We use interleave_high/low instructions to create such output.  The input of
4545    each interleave_high/low operation is two vectors:
4546    1st vec    2nd vec
4547    0 1 2 3    4 5 6 7
4548    the even elements of the result vector are obtained left-to-right from the
4549    high/low elements of the first vector.  The odd elements of the result are
4550    obtained left-to-right from the high/low elements of the second vector.
4551    The output of interleave_high will be:   0 4 1 5
4552    and of interleave_low:                   2 6 3 7
4553
4554
4555    The permutation is done in log LENGTH stages.  In each stage interleave_high
4556    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4557    where the first argument is taken from the first half of DR_CHAIN and the
4558    second argument from its second half.
4559    In our example,
4560
4561    I1: interleave_high (1st vec, 3rd vec)
4562    I2: interleave_low (1st vec, 3rd vec)
4563    I3: interleave_high (2nd vec, 4th vec)
4564    I4: interleave_low (2nd vec, 4th vec)
4565
4566    The output for the first stage is:
4567
4568    I1:  0 16  1 17  2 18  3 19
4569    I2:  4 20  5 21  6 22  7 23
4570    I3:  8 24  9 25 10 26 11 27
4571    I4: 12 28 13 29 14 30 15 31
4572
4573    The output of the second stage, i.e. the final result is:
4574
4575    I1:  0  8 16 24  1  9 17 25
4576    I2:  2 10 18 26  3 11 19 27
4577    I3:  4 12 20 28  5 13 21 29
4578    I4:  6 14 22 30  7 15 23 31.  */
4579
4580 void
4581 vect_permute_store_chain (vec<tree> dr_chain,
4582                           unsigned int length,
4583                           gimple stmt,
4584                           gimple_stmt_iterator *gsi,
4585                           vec<tree> *result_chain)
4586 {
4587   tree vect1, vect2, high, low;
4588   gimple perm_stmt;
4589   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4590   tree perm_mask_low, perm_mask_high;
4591   tree data_ref;
4592   tree perm3_mask_low, perm3_mask_high;
4593   unsigned int i, n, log_length = exact_log2 (length);
4594   unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4595   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4596
4597   result_chain->quick_grow (length);
4598   memcpy (result_chain->address (), dr_chain.address (),
4599           length * sizeof (tree));  /* RESULT_CHAIN starts as a copy of DR_CHAIN.  */
4600
4601   if (length == 3)
4602     {
4603       unsigned int j0 = 0, j1 = 0, j2 = 0;
4604
4605       for (j = 0; j < 3; j++)
4606         {
4607           int nelt0 = ((3 - j) * nelt) % 3;
4608           int nelt1 = ((3 - j) * nelt + 1) % 3;
4609           int nelt2 = ((3 - j) * nelt + 2) % 3;
4610
4611           for (i = 0; i < nelt; i++)
4612             {
4613               if (3 * i + nelt0 < nelt)
4614                 sel[3 * i + nelt0] = j0++;
4615               if (3 * i + nelt1 < nelt)
4616                 sel[3 * i + nelt1] = nelt + j1++;
4617               if (3 * i + nelt2 < nelt)
4618                 sel[3 * i + nelt2] = 0;
4619             }
4620           perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4621
4622           for (i = 0; i < nelt; i++)
4623             {
4624               if (3 * i + nelt0 < nelt)
4625                 sel[3 * i + nelt0] = 3 * i + nelt0;
4626               if (3 * i + nelt1 < nelt)
4627                 sel[3 * i + nelt1] = 3 * i + nelt1;
4628               if (3 * i + nelt2 < nelt)
4629                 sel[3 * i + nelt2] = nelt + j2++;
4630             }
4631           perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4632
4633           vect1 = dr_chain[0];
4634           vect2 = dr_chain[1];
4635
4636           /* Create interleaving stmt:
4637              low = VEC_PERM_EXPR <vect1, vect2,
4638                                   {j, nelt, *, j + 1, nelt + j + 1, *,
4639                                    j + 2, nelt + j + 2, *, ...}>  */
4640           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4641           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4642                                            vect2, perm3_mask_low);
4643           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4644
4645           vect1 = data_ref;
4646           vect2 = dr_chain[2];
4647           /* Create interleaving stmt:
4648              high = VEC_PERM_EXPR <vect1, vect2,
4649                                    {0, 1, nelt + j, 3, 4, nelt + j + 1,
4650                                     6, 7, nelt + j + 2, ...}>  */
4651           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4652           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4653                                            vect2, perm3_mask_high);
4654           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4655           (*result_chain)[j] = data_ref;
4656         }
4657     }
4658   else
4659     {
4660       /* If length is not equal to 3 then only power of 2 is supported.  */
4661       gcc_assert (exact_log2 (length) != -1);
4662
4663       for (i = 0, n = nelt / 2; i < n; i++)
4664         {
4665           sel[i * 2] = i;
4666           sel[i * 2 + 1] = i + nelt;
4667         }
4668         perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4669
4670         for (i = 0; i < nelt; i++)
4671           sel[i] += nelt / 2;
4672         perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4673
4674         for (i = 0, n = log_length; i < n; i++)
4675           {
4676             for (j = 0; j < length/2; j++)
4677               {
4678                 vect1 = dr_chain[j];
4679                 vect2 = dr_chain[j+length/2];
4680
4681                 /* Create interleaving stmt:
4682                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4683                                                         ...}>  */
4684                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4685                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4686                                                  vect2, perm_mask_high);
4687                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4688                 (*result_chain)[2*j] = high;
4689
4690                 /* Create interleaving stmt:
4691                    low = VEC_PERM_EXPR <vect1, vect2,
4692                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4693                                          ...}>  */
4694                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4695                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4696                                                  vect2, perm_mask_low);
4697                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4698                 (*result_chain)[2*j+1] = low;
4699               }
4700             memcpy (dr_chain.address (), result_chain->address (),
4701                     length * sizeof (tree));  /* This stage's output feeds the next.  */
4702           }
4703     }
4704 }
4705
4706 /* Function vect_setup_realignment
4707
4708    This function is called when vectorizing an unaligned load using
4709    the dr_explicit_realign[_optimized] scheme.
4710    This function generates the following code at the loop prolog:
4711
4712       p = initial_addr;
4713    x  msq_init = *(floor(p));   # prolog load
4714       realignment_token = call target_builtin;
4715     loop:
4716    x  msq = phi (msq_init, ---)
4717
4718    The stmts marked with x are generated only for the case of
4719    dr_explicit_realign_optimized.
4720
4721    The code above sets up a new (vector) pointer, pointing to the first
4722    location accessed by STMT, and a "floor-aligned" load using that pointer.
4723    It also generates code to compute the "realignment-token" (if the relevant
4724    target hook was defined), and creates a phi-node at the loop-header bb
4725    whose arguments are the result of the prolog-load (created by this
4726    function) and the result of a load that takes place in the loop (to be
4727    created by the caller to this function).
4728
4729    For the case of dr_explicit_realign_optimized:
4730    The caller to this function uses the phi-result (msq) to create the
4731    realignment code inside the loop, and sets up the missing phi argument,
4732    as follows:
4733     loop:
4734       msq = phi (msq_init, lsq)
4735       lsq = *(floor(p'));        # load in loop
4736       result = realign_load (msq, lsq, realignment_token);
4737
4738    For the case of dr_explicit_realign:
4739     loop:
4740       msq = *(floor(p));        # load in loop
4741       p' = p + (VS-1);
4742       lsq = *(floor(p'));       # load in loop
4743       result = realign_load (msq, lsq, realignment_token);
4744
4745    Input:
4746    STMT - (scalar) load stmt to be vectorized; it may access unaligned memory.
4747    GSI - place where new code is to be inserted.
4748    ALIGNMENT_SUPPORT_SCHEME - which of the two realignment schemes is used.
4749    INIT_ADDR - address precomputed by the caller, or NULL_TREE (see step 1).
4750
4751    Output:
4752    REALIGNMENT_TOKEN - set to the builtin_mask_for_load result, if defined.
4753    AT_LOOP - if non-NULL, receives the loop chosen for the initial load.
4754    Return value - result of the loop-header phi node (optimized scheme),
4755                   or NULL_TREE for dr_explicit_realign.  */
4756
4757 tree
4758 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4759                         tree *realignment_token,
4760                         enum dr_alignment_support alignment_support_scheme,
4761                         tree init_addr,
4762                         struct loop **at_loop)
4763 {
4764   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4765   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4766   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4767   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4768   struct loop *loop = NULL;
4769   edge pe = NULL;
4770   tree scalar_dest = gimple_assign_lhs (stmt);
4771   tree vec_dest;
4772   gimple inc;
4773   tree ptr;
4774   tree data_ref;
4775   basic_block new_bb;
4776   tree msq_init = NULL_TREE;
4777   tree new_temp;
4778   gphi *phi_stmt;
4779   tree msq = NULL_TREE;
4780   gimple_seq stmts = NULL;
4781   bool inv_p;
4782   bool compute_in_loop = false;
4783   bool nested_in_vect_loop = false;
4784   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4785   struct loop *loop_for_initial_load = NULL;
4786
4787   if (loop_vinfo)
4788     {
4789       loop = LOOP_VINFO_LOOP (loop_vinfo);
4790       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4791     }
4792
4793   gcc_assert (alignment_support_scheme == dr_explicit_realign
4794               || alignment_support_scheme == dr_explicit_realign_optimized);
4795
4796   /* We need to generate three things:
4797      1. the misalignment computation
4798      2. the extra vector load (for the optimized realignment scheme).
4799      3. the phi node for the two vectors from which the realignment is
4800       done (for the optimized realignment scheme).  */
4801
4802   /* 1. Determine where to generate the misalignment computation.
4803
4804      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4805      calculation will be generated by this function, outside the loop (in the
4806      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
4807      caller, inside the loop.
4808
4809      Background: If the misalignment remains fixed throughout the iterations of
4810      the loop, then both realignment schemes are applicable, and also the
4811      misalignment computation can be done outside LOOP.  This is because we are
4812      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4813      are a multiple of VS (the Vector Size), and therefore the misalignment in
4814      different vectorized LOOP iterations is always the same.
4815      The problem arises only if the memory access is in an inner-loop nested
4816      inside LOOP, which is now being vectorized using outer-loop vectorization.
4817      This is the only case when the misalignment of the memory access may not
4818      remain fixed throughout the iterations of the inner-loop (as explained in
4819      detail in vect_supportable_dr_alignment).  In this case, not only is the
4820      optimized realignment scheme not applicable, but also the misalignment
4821      computation (and generation of the realignment token that is passed to
4822      REALIGN_LOAD) have to be done inside the loop.
4823
4824      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4825      or not, which in turn determines if the misalignment is computed inside
4826      the inner-loop, or outside LOOP.  */
4827
4828   if (init_addr != NULL_TREE || !loop_vinfo)
4829     {
4830       compute_in_loop = true;
4831       gcc_assert (alignment_support_scheme == dr_explicit_realign);
4832     }
4833
4834
4835   /* 2. Determine where to generate the extra vector load.
4836
4837      For the optimized realignment scheme, instead of generating two vector
4838      loads in each iteration, we generate a single extra vector load in the
4839      preheader of the loop, and in each iteration reuse the result of the
4840      vector load from the previous iteration.  In case the memory access is in
4841      an inner-loop nested inside LOOP, which is now being vectorized using
4842      outer-loop vectorization, we need to determine whether this initial vector
4843      load should be generated at the preheader of the inner-loop, or can be
4844      generated at the preheader of LOOP.  If the memory access has no evolution
4845      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4846      to be generated inside LOOP (in the preheader of the inner-loop).  */
4847
4848   if (nested_in_vect_loop)
4849     {
4850       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4851       bool invariant_in_outerloop =
4852             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4853       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4854     }
4855   else
4856     loop_for_initial_load = loop;
4857   if (at_loop)
4858     *at_loop = loop_for_initial_load;
4859
4860   if (loop_for_initial_load)
4861     pe = loop_preheader_edge (loop_for_initial_load);
4862
4863   /* 3. For the case of the optimized realignment, create the first vector
4864       load at the loop preheader.  */
4865
4866   if (alignment_support_scheme == dr_explicit_realign_optimized)
4867     {
4868       /* Create msq_init = *(floor(p1)) in the loop preheader  */
4869       gassign *new_stmt;
4870
4871       gcc_assert (!compute_in_loop);
4872       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4873       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4874                                       NULL_TREE, &init_addr, NULL, &inc,
4875                                       true, &inv_p);  /* INC, INV_P unused here.  */
4876       new_temp = copy_ssa_name (ptr);
4877       new_stmt = gimple_build_assign
4878                    (new_temp, BIT_AND_EXPR, ptr,
4879                     build_int_cst (TREE_TYPE (ptr),
4880                                    -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4881       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4882       gcc_assert (!new_bb);
4883       data_ref
4884         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4885                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4886       new_stmt = gimple_build_assign (vec_dest, data_ref);
4887       new_temp = make_ssa_name (vec_dest, new_stmt);
4888       gimple_assign_set_lhs (new_stmt, new_temp);
4889       if (pe)  /* NOTE(review): PE was already used unguarded above.  */
4890         {
4891           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4892           gcc_assert (!new_bb);
4893         }
4894       else
4895          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4896
4897       msq_init = gimple_assign_lhs (new_stmt);
4898     }
4899
4900   /* 4. Create realignment token using a target builtin, if available.
4901       It is done either inside the containing loop, or before LOOP (as
4902       determined above).  */
4903
4904   if (targetm.vectorize.builtin_mask_for_load)
4905     {
4906       gcall *new_stmt;
4907       tree builtin_decl;
4908
4909       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
4910       if (!init_addr)
4911         {
4912           /* Generate the INIT_ADDR computation outside LOOP.  */
4913           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4914                                                         NULL_TREE, loop);
4915           if (loop)
4916             {
4917               pe = loop_preheader_edge (loop);
4918               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4919               gcc_assert (!new_bb);
4920             }
4921           else
4922              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4923         }
4924
4925       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4926       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4927       vec_dest =
4928         vect_create_destination_var (scalar_dest,
4929                                      gimple_call_return_type (new_stmt));
4930       new_temp = make_ssa_name (vec_dest, new_stmt);
4931       gimple_call_set_lhs (new_stmt, new_temp);
4932
4933       if (compute_in_loop)
4934         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4935       else
4936         {
4937           /* Generate the misalignment computation outside LOOP.  */
4938           pe = loop_preheader_edge (loop);
4939           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4940           gcc_assert (!new_bb);
4941         }
4942
4943       *realignment_token = gimple_call_lhs (new_stmt);
4944
4945       /* The result of the CALL_EXPR to this builtin is determined from
4946          the value of the parameter and no global variables are touched
4947          which makes the builtin a "const" function.  Requiring the
4948          builtin to have the "const" attribute makes it unnecessary
4949          to call mark_call_clobbered.  */
4950       gcc_assert (TREE_READONLY (builtin_decl));
4951     }
4952
4953   if (alignment_support_scheme == dr_explicit_realign)
4954     return msq;  /* Still NULL_TREE; no phi is created for this scheme.  */
4955
4956   gcc_assert (!compute_in_loop);
4957   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4958
4959
4960   /* 5. Create msq = phi <msq_init, lsq> in loop  */
4961
4962   pe = loop_preheader_edge (containing_loop);
4963   vec_dest = vect_create_destination_var (scalar_dest, vectype);
4964   msq = make_ssa_name (vec_dest);
4965   phi_stmt = create_phi_node (msq, containing_loop->header);
4966   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4967
4968   return msq;
4969 }
4970
4971
4972 /* Function vect_grouped_load_supported.
4973
4974    Returns TRUE if the permutations needed for a group of COUNT loads of
4975    VECTYPE are supported (shuffle-3 masks, or even/odd), FALSE otherwise.  */
4976
4977 bool
4978 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4979 {
4980   machine_mode mode = TYPE_MODE (vectype);
4981
4982   /* vect_permute_load_chain requires the group size to be equal to 3 or
4983      be a power of two.  */
4984   if (count != 3 && exact_log2 (count) == -1)
4985     {
4986       if (dump_enabled_p ())
4987         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4988                          "the size of the group of accesses"
4989                          " is not a power of 2 or not equal to 3\n");
4990       return false;
4991     }
4992
4993   /* Check that the permutation is supported.  */
4994   if (VECTOR_MODE_P (mode))
4995     {
4996       unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
4997       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4998
4999       if (count == 3)
5000         {
5001           unsigned int k;
5002           for (k = 0; k < 3; k++)
5003             {
5004               for (i = 0; i < nelt; i++)
5005                 if (3 * i + k < 2 * nelt)
5006                   sel[i] = 3 * i + k;
5007                 else
5008                   sel[i] = 0;  /* Placeholder for lanes the second mask overrides.  */
5009               if (!can_vec_perm_p (mode, false, sel))
5010                 {
5011                   if (dump_enabled_p ())
5012                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5013                                      "shuffle of 3 loads is not supported by"
5014                                      " target\n");
5015                   return false;
5016                 }
5017               for (i = 0, j = 0; i < nelt; i++)
5018                 if (3 * i + k < 2 * nelt)
5019                   sel[i] = i;
5020                 else
5021                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5022               if (!can_vec_perm_p (mode, false, sel))
5023                 {
5024                   if (dump_enabled_p ())
5025                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5026                                      "shuffle of 3 loads is not supported by"
5027                                      " target\n");
5028                   return false;
5029                 }
5030             }
5031           return true;
5032         }
5033       else
5034         {
5035           /* If length is not equal to 3 then only power of 2 is supported.  */
5036           gcc_assert (exact_log2 (count) != -1);
5037           for (i = 0; i < nelt; i++)
5038             sel[i] = i * 2;
5039           if (can_vec_perm_p (mode, false, sel))
5040             {
5041               for (i = 0; i < nelt; i++)
5042                 sel[i] = i * 2 + 1;
5043               if (can_vec_perm_p (mode, false, sel))
5044                 return true;
5045             }
5046         }
5047     }
5048
5049   if (dump_enabled_p ())  /* Also reached when MODE is not a vector mode.  */
5050     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5051                      "extract even/odd not supported by target\n");
5052   return false;
5053 }
5054
5055 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5056    type VECTYPE; the query is delegated to vect_lanes_optab_supported_p.  */
5057
5058 bool
5059 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5060 {
5061   return vect_lanes_optab_supported_p ("vec_load_lanes",
5062                                        vec_load_lanes_optab,
5063                                        vectype, count);
5064 }
5065
5066 /* Function vect_permute_load_chain.
5067
5068    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5069    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5070    the input data correctly.  Return the final references for loads in
5071    RESULT_CHAIN; a power-of-2 LENGTH also overwrites DR_CHAIN's contents.
5072
5073    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5074    The input is 4 vectors each containing 8 elements. We assign a number to each
5075    element, the input sequence is:
5076
5077    1st vec:   0  1  2  3  4  5  6  7
5078    2nd vec:   8  9 10 11 12 13 14 15
5079    3rd vec:  16 17 18 19 20 21 22 23
5080    4th vec:  24 25 26 27 28 29 30 31
5081
5082    The output sequence should be:
5083
5084    1st vec:  0 4  8 12 16 20 24 28
5085    2nd vec:  1 5  9 13 17 21 25 29
5086    3rd vec:  2 6 10 14 18 22 26 30
5087    4th vec:  3 7 11 15 19 23 27 31
5088
5089    i.e., the first output vector should contain the first elements of each
5090    interleaving group, etc.
5091
5092    We use extract_even/odd instructions to create such output.  The input of
5093    each extract_even/odd operation is two vectors
5094    1st vec    2nd vec
5095    0 1 2 3    4 5 6 7
5096
5097    and the output is the vector of extracted even/odd elements.  The output of
5098    extract_even will be:   0 2 4 6
5099    and of extract_odd:     1 3 5 7
5100
5101
5102    The permutation is done in log LENGTH stages.  In each stage extract_even
5103    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5104    their order.  In our example,
5105
5106    E1: extract_even (1st vec, 2nd vec)
5107    E2: extract_odd (1st vec, 2nd vec)
5108    E3: extract_even (3rd vec, 4th vec)
5109    E4: extract_odd (3rd vec, 4th vec)
5110
5111    The output for the first stage will be:
5112
5113    E1:  0  2  4  6  8 10 12 14
5114    E2:  1  3  5  7  9 11 13 15
5115    E3: 16 18 20 22 24 26 28 30
5116    E4: 17 19 21 23 25 27 29 31
5117
5118    In order to proceed and create the correct sequence for the next stage (or
5119    for the correct output, if the second stage is the last one, as in our
5120    example), we first put the output of extract_even operation and then the
5121    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5122    The input for the second stage is:
5123
5124    1st vec (E1):  0  2  4  6  8 10 12 14
5125    2nd vec (E3): 16 18 20 22 24 26 28 30
5126    3rd vec (E2):  1  3  5  7  9 11 13 15
5127    4th vec (E4): 17 19 21 23 25 27 29 31
5128
5129    The output of the second stage:
5130
5131    E1: 0 4  8 12 16 20 24 28
5132    E2: 2 6 10 14 18 22 26 30
5133    E3: 1 5  9 13 17 21 25 29
5134    E4: 3 7 11 15 19 23 27 31
5135
5136    And RESULT_CHAIN after reordering:
5137
5138    1st vec (E1):  0 4  8 12 16 20 24 28
5139    2nd vec (E3):  1 5  9 13 17 21 25 29
5140    3rd vec (E2):  2 6 10 14 18 22 26 30
5141    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5142
5143 static void
5144 vect_permute_load_chain (vec<tree> dr_chain,
5145                          unsigned int length,
5146                          gimple stmt,
5147                          gimple_stmt_iterator *gsi,
5148                          vec<tree> *result_chain)
5149 {
5150   tree data_ref, first_vect, second_vect;
5151   tree perm_mask_even, perm_mask_odd;
5152   tree perm3_mask_low, perm3_mask_high;
5153   gimple perm_stmt;
5154   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5155   unsigned int i, j, log_length = exact_log2 (length);  /* Used if LENGTH != 3.  */
5156   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5157   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5158
5159   result_chain->quick_grow (length);  /* RESULT_CHAIN starts as a copy of DR_CHAIN.  */
5160   memcpy (result_chain->address (), dr_chain.address (),
5161           length * sizeof (tree));
5162
5163   if (length == 3)
5164     {
5165       unsigned int k;
5166
5167       for (k = 0; k < 3; k++)
5168         {
5169           for (i = 0; i < nelt; i++)
5170             if (3 * i + k < 2 * nelt)
5171               sel[i] = 3 * i + k;
5172             else
5173               sel[i] = 0;
5174           perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5175
5176           for (i = 0, j = 0; i < nelt; i++)
5177             if (3 * i + k < 2 * nelt)
5178               sel[i] = i;
5179             else
5180               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5181
5182           perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5183
5184           first_vect = dr_chain[0];
5185           second_vect = dr_chain[1];
5186
5187           /* Create the shuffle stmt for the first two vectors (low part):
5188              low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5189                                                             ...}>  */
5190           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5191           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5192                                            second_vect, perm3_mask_low);
5193           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5194
5195           /* Create the shuffle stmt that merges in the third vector (high part):
5196              high = VEC_PERM_EXPR <low, dr_chain[2], {0, 1, ...,
5197                                    nelt + (nelt + k) % 3, ...}>  */
5198           first_vect = data_ref;
5199           second_vect = dr_chain[2];
5200           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5201           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5202                                            second_vect, perm3_mask_high);
5203           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5204           (*result_chain)[k] = data_ref;
5205         }
5206     }
5207   else
5208     {
5209       /* If length is not equal to 3 then only power of 2 is supported.  */
5210       gcc_assert (exact_log2 (length) != -1);
5211
5212       for (i = 0; i < nelt; ++i)
5213         sel[i] = i * 2;
5214       perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5215
5216       for (i = 0; i < nelt; ++i)
5217         sel[i] = i * 2 + 1;
5218       perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5219
5220       for (i = 0; i < log_length; i++)
5221         {
5222           for (j = 0; j < length; j += 2)
5223             {
5224               first_vect = dr_chain[j];
5225               second_vect = dr_chain[j+1];
5226
5227               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5228               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5229               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5230                                                first_vect, second_vect,
5231                                                perm_mask_even);
5232               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5233               (*result_chain)[j/2] = data_ref;
5234
5235               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5236               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5237               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5238                                                first_vect, second_vect,
5239                                                perm_mask_odd);
5240               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5241               (*result_chain)[j/2+length/2] = data_ref;
5242             }
5243           memcpy (dr_chain.address (), result_chain->address (),
5244                   length * sizeof (tree));  /* This stage's output feeds the next.  */
5245         }
5246     }
5247 }
5248
5249 /* Function vect_shift_permute_load_chain.
5250
5251    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5252    sequence of stmts to reorder the input data accordingly.
5253    Return the final references for loads in RESULT_CHAIN.
5254    Return true if successful, false otherwise.
5255
5256    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5257    The input is 3 vectors each containing 8 elements.  We assign a
5258    number to each element, the input sequence is:
5259
5260    1st vec:   0  1  2  3  4  5  6  7
5261    2nd vec:   8  9 10 11 12 13 14 15
5262    3rd vec:  16 17 18 19 20 21 22 23
5263
5264    The output sequence should be:
5265
5266    1st vec:  0 3 6  9 12 15 18 21
5267    2nd vec:  1 4 7 10 13 16 19 22
5268    3rd vec:  2 5 8 11 14 17 20 23
5269
5270    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5271
5272    First we shuffle all 3 vectors to get correct elements order:
5273
5274    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5275    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5276    3rd vec:  (16 19 22) (17 20 23) (18 21)
5277
5278    Next we unite and shift vector 3 times:
5279
5280    1st step:
5281      shift right by 6 the concatenation of:
5282      "1st vec" and  "2nd vec"
5283        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5284      "2nd vec" and  "3rd vec"
5285        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5286      "3rd vec" and  "1st vec"
5287        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5288                              | New vectors                   |
5289
5290      So that now new vectors are:
5291
5292      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5293      2nd vec:  (10 13) (16 19 22) (17 20 23)
5294      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5295
5296    2nd step:
5297      shift right by 5 the concatenation of:
5298      "1st vec" and  "3rd vec"
5299        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5300      "2nd vec" and  "1st vec"
5301        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5302      "3rd vec" and  "2nd vec"
5303        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5304                           | New vectors                   |
5305
5306      So that now new vectors are:
5307
5308      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5309      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
5310      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
5311
5312    3rd step:
5313      shift right by 5 the concatenation of:
5314      "1st vec" and  "1st vec"
5315        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
5316      shift right by 3 the concatenation of:
5317      "2nd vec" and  "2nd vec"
5318                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
5319                           | New vectors                   |
5320
5321      So that now all vectors are READY:
5322      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
5323      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
5324      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
5325
5326    This algorithm is faster than one in vect_permute_load_chain if:
5327      1.  "shift of a concatenation" is faster than general permutation.
5328          This is usually so.
5329      2.  The TARGET machine can't execute vector instructions in parallel.
5330          This is because each step of the algorithm depends on previous.
5331          The algorithm in vect_permute_load_chain is much more parallel.
5332
5333    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5334 */
5335
5336 static bool
5337 vect_shift_permute_load_chain (vec<tree> dr_chain,
5338                                unsigned int length,
5339                                gimple stmt,
5340                                gimple_stmt_iterator *gsi,
5341                                vec<tree> *result_chain)
5342 {
5343   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5344   tree perm2_mask1, perm2_mask2, perm3_mask;
5345   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5346   gimple perm_stmt;
5347
5348   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5349   unsigned int i;
5350   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5351   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5352   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5353   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5354
5355   result_chain->quick_grow (length);
5356   memcpy (result_chain->address (), dr_chain.address (),
5357           length * sizeof (tree));
5358
5359   if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5360     {
5361       unsigned int j, log_length = exact_log2 (length);
5362       for (i = 0; i < nelt / 2; ++i)
5363         sel[i] = i * 2;
5364       for (i = 0; i < nelt / 2; ++i)
5365         sel[nelt / 2 + i] = i * 2 + 1;
5366       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5367         {
5368           if (dump_enabled_p ())
5369             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5370                              "shuffle of 2 fields structure is not \
5371                               supported by target\n");
5372           return false;
5373         }
5374       perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5375
5376       for (i = 0; i < nelt / 2; ++i)
5377         sel[i] = i * 2 + 1;
5378       for (i = 0; i < nelt / 2; ++i)
5379         sel[nelt / 2 + i] = i * 2;
5380       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5381         {
5382           if (dump_enabled_p ())
5383             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5384                              "shuffle of 2 fields structure is not \
5385                               supported by target\n");
5386           return false;
5387         }
5388       perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5389
5390       /* Generating permutation constant to shift all elements.
5391          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
5392       for (i = 0; i < nelt; i++)
5393         sel[i] = nelt / 2 + i;
5394       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5395         {
5396           if (dump_enabled_p ())
5397             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5398                              "shift permutation is not supported by target\n");
5399           return false;
5400         }
5401       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5402
5403       /* Generating permutation constant to select vector from 2.
5404          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
5405       for (i = 0; i < nelt / 2; i++)
5406         sel[i] = i;
5407       for (i = nelt / 2; i < nelt; i++)
5408         sel[i] = nelt + i;
5409       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5410         {
5411           if (dump_enabled_p ())
5412             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5413                              "select is not supported by target\n");
5414           return false;
5415         }
5416       select_mask = vect_gen_perm_mask_checked (vectype, sel);
5417
5418       for (i = 0; i < log_length; i++)
5419         {
5420           for (j = 0; j < length; j += 2)
5421             {
5422               first_vect = dr_chain[j];
5423               second_vect = dr_chain[j + 1];
5424
5425               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5426               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5427                                                first_vect, first_vect,
5428                                                perm2_mask1);
5429               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5430               vect[0] = data_ref;
5431
5432               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5433               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5434                                                second_vect, second_vect,
5435                                                perm2_mask2);
5436               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5437               vect[1] = data_ref;
5438
5439               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5440               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5441                                                vect[0], vect[1], shift1_mask);
5442               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5443               (*result_chain)[j/2 + length/2] = data_ref;
5444
5445               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5446               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5447                                                vect[0], vect[1], select_mask);
5448               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5449               (*result_chain)[j/2] = data_ref;
5450             }
5451           memcpy (dr_chain.address (), result_chain->address (),
5452                   length * sizeof (tree));
5453         }
5454       return true;
5455     }
5456   if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5457     {
5458       unsigned int k = 0, l = 0;
5459
5460       /* Generating permutation constant to get all elements in right order.
5461          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
5462       for (i = 0; i < nelt; i++)
5463         {
5464           if (3 * k + (l % 3) >= nelt)
5465             {
5466               k = 0;
5467               l += (3 - (nelt % 3));
5468             }
5469           sel[i] = 3 * k + (l % 3);
5470           k++;
5471         }
5472       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5473         {
5474           if (dump_enabled_p ())
5475             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5476                              "shuffle of 3 fields structure is not \
5477                               supported by target\n");
5478           return false;
5479         }
5480       perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5481
5482       /* Generating permutation constant to shift all elements.
5483          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
5484       for (i = 0; i < nelt; i++)
5485         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5486       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5487         {
5488           if (dump_enabled_p ())
5489             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5490                              "shift permutation is not supported by target\n");
5491           return false;
5492         }
5493       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5494
5495       /* Generating permutation constant to shift all elements.
5496          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5497       for (i = 0; i < nelt; i++)
5498         sel[i] = 2 * (nelt / 3) + 1 + i;
5499       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5500         {
5501           if (dump_enabled_p ())
5502             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5503                              "shift permutation is not supported by target\n");
5504           return false;
5505         }
5506       shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5507
5508       /* Generating permutation constant to shift all elements.
5509          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
5510       for (i = 0; i < nelt; i++)
5511         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5512       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5513         {
5514           if (dump_enabled_p ())
5515             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5516                              "shift permutation is not supported by target\n");
5517           return false;
5518         }
5519       shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5520
5521       /* Generating permutation constant to shift all elements.
5522          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5523       for (i = 0; i < nelt; i++)
5524         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5525       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5526         {
5527           if (dump_enabled_p ())
5528             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5529                              "shift permutation is not supported by target\n");
5530           return false;
5531         }
5532       shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5533
5534       for (k = 0; k < 3; k++)
5535         {
5536           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5537           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5538                                            dr_chain[k], dr_chain[k],
5539                                            perm3_mask);
5540           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5541           vect[k] = data_ref;
5542         }
5543
5544       for (k = 0; k < 3; k++)
5545         {
5546           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5547           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5548                                            vect[k % 3], vect[(k + 1) % 3],
5549                                            shift1_mask);
5550           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5551           vect_shift[k] = data_ref;
5552         }
5553
5554       for (k = 0; k < 3; k++)
5555         {
5556           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5557           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5558                                            vect_shift[(4 - k) % 3],
5559                                            vect_shift[(3 - k) % 3],
5560                                            shift2_mask);
5561           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5562           vect[k] = data_ref;
5563         }
5564
5565       (*result_chain)[3 - (nelt % 3)] = vect[2];
5566
5567       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5568       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5569                                        vect[0], shift3_mask);
5570       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5571       (*result_chain)[nelt % 3] = data_ref;
5572
5573       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5574       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5575                                        vect[1], shift4_mask);
5576       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5577       (*result_chain)[0] = data_ref;
5578       return true;
5579     }
5580   return false;
5581 }
5582
5583 /* Function vect_transform_grouped_load.
5584
5585    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5586    to perform their permutation and ascribe the result vectorized statements to
5587    the scalar statements.
5588 */
5589
5590 void
5591 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
5592                              gimple_stmt_iterator *gsi)
5593 {
5594   machine_mode mode;
5595   vec<tree> result_chain = vNULL;
5596
5597   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5598      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5599      vectors, that are ready for vector computation.  */
5600   result_chain.create (size);
5601
5602   /* If reassociation width for vector type is 2 or greater target machine can
5603      execute 2 or more vector instructions in parallel.  Otherwise try to
5604      get chain for loads group using vect_shift_permute_load_chain.  */
5605   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5606   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5607       || exact_log2 (size) != -1
5608       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5609                                          gsi, &result_chain))
5610     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5611   vect_record_grouped_load_vectors (stmt, result_chain);
5612   result_chain.release ();
5613 }
5614
5615 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5616    generated as part of the vectorization of STMT.  Assign the statement
5617    for each vector to the associated scalar statement.  */
5618
5619 void
5620 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
5621 {
5622   gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5623   gimple next_stmt, new_stmt;
5624   unsigned int i, gap_count;
5625   tree tmp_data_ref;
5626
5627   /* Put a permuted data-ref in the VECTORIZED_STMT field.
5628      Since we scan the chain starting from it's first node, their order
5629      corresponds the order of data-refs in RESULT_CHAIN.  */
5630   next_stmt = first_stmt;
5631   gap_count = 1;
5632   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5633     {
5634       if (!next_stmt)
5635         break;
5636
5637       /* Skip the gaps.  Loads created for the gaps will be removed by dead
5638        code elimination pass later.  No need to check for the first stmt in
5639        the group, since it always exists.
5640        GROUP_GAP is the number of steps in elements from the previous
5641        access (if there is no gap GROUP_GAP is 1).  We skip loads that
5642        correspond to the gaps.  */
5643       if (next_stmt != first_stmt
5644           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5645       {
5646         gap_count++;
5647         continue;
5648       }
5649
5650       while (next_stmt)
5651         {
5652           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5653           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5654              copies, and we put the new vector statement in the first available
5655              RELATED_STMT.  */
5656           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5657             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5658           else
5659             {
5660               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5661                 {
5662                   gimple prev_stmt =
5663                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5664                   gimple rel_stmt =
5665                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5666                   while (rel_stmt)
5667                     {
5668                       prev_stmt = rel_stmt;
5669                       rel_stmt =
5670                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5671                     }
5672
5673                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5674                     new_stmt;
5675                 }
5676             }
5677
5678           next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5679           gap_count = 1;
5680           /* If NEXT_STMT accesses the same DR as the previous statement,
5681              put the same TMP_DATA_REF as its vectorized statement; otherwise
5682              get the next data-ref from RESULT_CHAIN.  */
5683           if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5684             break;
5685         }
5686     }
5687 }
5688
5689 /* Function vect_force_dr_alignment_p.
5690
5691    Returns whether the alignment of a DECL can be forced to be aligned
5692    on ALIGNMENT bit boundary.  */
5693
5694 bool
5695 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5696 {
5697   if (TREE_CODE (decl) != VAR_DECL)
5698     return false;
5699
5700   if (decl_in_symtab_p (decl)
5701       && !symtab_node::get (decl)->can_increase_alignment_p ())
5702     return false;
5703
5704   if (TREE_STATIC (decl))
5705     return (alignment <= MAX_OFILE_ALIGNMENT);
5706   else
5707     return (alignment <= MAX_STACK_ALIGNMENT);
5708 }
5709
5710
5711 /* Return whether the data reference DR is supported with respect to its
5712    alignment.
5713    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5714    it is aligned, i.e., check if it is possible to vectorize it with different
5715    alignment.  */
5716
5717 enum dr_alignment_support
5718 vect_supportable_dr_alignment (struct data_reference *dr,
5719                                bool check_aligned_accesses)
5720 {
5721   gimple stmt = DR_STMT (dr);
5722   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5723   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5724   machine_mode mode = TYPE_MODE (vectype);
5725   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5726   struct loop *vect_loop = NULL;
5727   bool nested_in_vect_loop = false;
5728
5729   if (aligned_access_p (dr) && !check_aligned_accesses)
5730     return dr_aligned;
5731
5732   /* For now assume all conditional loads/stores support unaligned
5733      access without any special code.  */
5734   if (is_gimple_call (stmt)
5735       && gimple_call_internal_p (stmt)
5736       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5737           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5738     return dr_unaligned_supported;
5739
5740   if (loop_vinfo)
5741     {
5742       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5743       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5744     }
5745
5746   /* Possibly unaligned access.  */
5747
5748   /* We can choose between using the implicit realignment scheme (generating
5749      a misaligned_move stmt) and the explicit realignment scheme (generating
5750      aligned loads with a REALIGN_LOAD).  There are two variants to the
5751      explicit realignment scheme: optimized, and unoptimized.
5752      We can optimize the realignment only if the step between consecutive
5753      vector loads is equal to the vector size.  Since the vector memory
5754      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5755      is guaranteed that the misalignment amount remains the same throughout the
5756      execution of the vectorized loop.  Therefore, we can create the
5757      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5758      at the loop preheader.
5759
5760      However, in the case of outer-loop vectorization, when vectorizing a
5761      memory access in the inner-loop nested within the LOOP that is now being
5762      vectorized, while it is guaranteed that the misalignment of the
5763      vectorized memory access will remain the same in different outer-loop
5764      iterations, it is *not* guaranteed that is will remain the same throughout
5765      the execution of the inner-loop.  This is because the inner-loop advances
5766      with the original scalar step (and not in steps of VS).  If the inner-loop
5767      step happens to be a multiple of VS, then the misalignment remains fixed
5768      and we can use the optimized realignment scheme.  For example:
5769
5770       for (i=0; i<N; i++)
5771         for (j=0; j<M; j++)
5772           s += a[i+j];
5773
5774      When vectorizing the i-loop in the above example, the step between
5775      consecutive vector loads is 1, and so the misalignment does not remain
5776      fixed across the execution of the inner-loop, and the realignment cannot
5777      be optimized (as illustrated in the following pseudo vectorized loop):
5778
5779       for (i=0; i<N; i+=4)
5780         for (j=0; j<M; j++){
5781           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5782                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
5783                          // (assuming that we start from an aligned address).
5784           }
5785
5786      We therefore have to use the unoptimized realignment scheme:
5787
5788       for (i=0; i<N; i+=4)
5789           for (j=k; j<M; j+=4)
5790           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5791                            // that the misalignment of the initial address is
5792                            // 0).
5793
5794      The loop can then be vectorized as follows:
5795
5796       for (k=0; k<4; k++){
5797         rt = get_realignment_token (&vp[k]);
5798         for (i=0; i<N; i+=4){
5799           v1 = vp[i+k];
5800           for (j=k; j<M; j+=4){
5801             v2 = vp[i+j+VS-1];
5802             va = REALIGN_LOAD <v1,v2,rt>;
5803             vs += va;
5804             v1 = v2;
5805           }
5806         }
5807     } */
5808
5809   if (DR_IS_READ (dr))
5810     {
5811       bool is_packed = false;
5812       tree type = (TREE_TYPE (DR_REF (dr)));
5813
5814       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5815           && (!targetm.vectorize.builtin_mask_for_load
5816               || targetm.vectorize.builtin_mask_for_load ()))
5817         {
5818           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5819           if ((nested_in_vect_loop
5820                && (TREE_INT_CST_LOW (DR_STEP (dr))
5821                    != GET_MODE_SIZE (TYPE_MODE (vectype))))
5822               || !loop_vinfo)
5823             return dr_explicit_realign;
5824           else
5825             return dr_explicit_realign_optimized;
5826         }
5827       if (!known_alignment_for_access_p (dr))
5828         is_packed = not_size_aligned (DR_REF (dr));
5829
5830       if ((TYPE_USER_ALIGN (type) && !is_packed)
5831           || targetm.vectorize.
5832                support_vector_misalignment (mode, type,
5833                                             DR_MISALIGNMENT (dr), is_packed))
5834         /* Can't software pipeline the loads, but can at least do them.  */
5835         return dr_unaligned_supported;
5836     }
5837   else
5838     {
5839       bool is_packed = false;
5840       tree type = (TREE_TYPE (DR_REF (dr)));
5841
5842       if (!known_alignment_for_access_p (dr))
5843         is_packed = not_size_aligned (DR_REF (dr));
5844
5845      if ((TYPE_USER_ALIGN (type) && !is_packed)
5846          || targetm.vectorize.
5847               support_vector_misalignment (mode, type,
5848                                            DR_MISALIGNMENT (dr), is_packed))
5849        return dr_unaligned_supported;
5850     }
5851
5852   /* Unsupported.  */
5853   return dr_unaligned_unsupported;
5854 }