gcc/tree-vectorizer.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Free Software
   3    Foundation, Inc.
   4    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 /* Loop Vectorization Pass.
  23
  24    This pass tries to vectorize loops. This first implementation focuses on
  25    simple inner-most loops, with no conditional control flow, and a set of
  26    simple operations which vector form can be expressed using existing
  27    tree codes (PLUS, MULT etc).
  28
  29    For example, the vectorizer transforms the following simple loop:
  30
  31         short a[N]; short b[N]; short c[N]; int i;
  32
  33         for (i=0; i<N; i++){
  34           a[i] = b[i] + c[i];
  35         }
  36
  37    as if it was manually vectorized by rewriting the source code into:
  38
  39         typedef int __attribute__((mode(V8HI))) v8hi;
  40         short a[N];  short b[N]; short c[N];   int i;
  41         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  42         v8hi va, vb, vc;
  43
  44         for (i=0; i<N/8; i++){
  45           vb = pb[i];
  46           vc = pc[i];
  47           va = vb + vc;
  48           pa[i] = va;
  49         }
  50
  51         The main entry to this pass is vectorize_loops(), in which
  52    the vectorizer applies a set of analyses on a given set of loops,
  53    followed by the actual vectorization transformation for the loops that
  54    had successfully passed the analysis phase.
  55
  56         Throughout this pass we make a distinction between two types of
  57    data: scalars (which are represented by SSA_NAMES), and memory references
  58    ("data-refs"). These two types of data require different handling both
  59    during analysis and transformation. The types of data-refs that the
  60    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  61    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  62    accesses are required to have a  simple (consecutive) access pattern.
  63
  64    Analysis phase:
  65    ===============
  66         The driver for the analysis phase is vect_analyze_loop_nest().
  67    It applies a set of analyses, some of which rely on the scalar evolution
  68    analyzer (scev) developed by Sebastian Pop.
  69
  70         During the analysis phase the vectorizer records some information
  71    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
  72    loop, as well as general information about the loop as a whole, which is
  73    recorded in a "loop_vec_info" struct attached to each loop.
  74
  75    Transformation phase:
  76    =====================
  77         The loop transformation phase scans all the stmts in the loop, and
  78    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
  79    the loop that needs to be vectorized. It insert the vector code sequence
  80    just before the scalar stmt S, and records a pointer to the vector code
  81    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
  82    attached to S). This pointer will be used for the vectorization of following
  83    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
  84    otherwise, we rely on dead code elimination for removing it.
  85
  86         For example, say stmt S1 was vectorized into stmt VS1:
  87
  88    VS1: vb = px[i];
  89    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
  90    S2:  a = b;
  91
  92    To vectorize stmt S2, the vectorizer first finds the stmt that defines
  93    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
  94    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
  95    resulting sequence would be:
  96
  97    VS1: vb = px[i];
  98    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
  99    VS2: va = vb;
 100    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 101
 102         Operands that are not SSA_NAMEs, are data-refs that appear in
 103    load/store operations (like 'x[i]' in S1), and are handled differently.
 104
 105    Target modeling:
 106    =================
 107         Currently the only target specific information that is used is the
 108    size of the vector (in bytes) - "UNITS_PER_SIMD_WORD". Targets that can
 109    support different sizes of vectors, for now will need to specify one value
 110    for "UNITS_PER_SIMD_WORD". More flexibility will be added in the future.
 111
 112         Since we only vectorize operations which vector form can be
 113    expressed using existing tree codes, to verify that an operation is
 114    supported, the vectorizer checks the relevant optab at the relevant
 115    machine_mode (e.g, optab_handler (add_optab, V8HImode)->insn_code). If
 116    the value found is CODE_FOR_nothing, then there's no target support, and
 117    we can't vectorize the stmt.
 118
 119    For additional information on this project see:
 120    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 121 */
 122
 123 #include "config.h"
 124 #include "system.h"
 125 #include "coretypes.h"
 126 #include "tm.h"
 127 #include "ggc.h"
 128 #include "tree.h"
 129 #include "target.h"
 130 #include "rtl.h"
 131 #include "basic-block.h"
 132 #include "diagnostic.h"
 133 #include "tree-flow.h"
 134 #include "tree-dump.h"
 135 #include "timevar.h"
 136 #include "cfgloop.h"
 137 #include "cfglayout.h"
 138 #include "expr.h"
 139 #include "recog.h"
 140 #include "optabs.h"
 141 #include "params.h"
 142 #include "toplev.h"
 143 #include "tree-chrec.h"
 144 #include "tree-data-ref.h"
 145 #include "tree-scalar-evolution.h"
 146 #include "input.h"
 147 #include "hashtab.h"
 148 #include "tree-vectorizer.h"
 149 #include "tree-pass.h"
 150 #include "langhooks.h"
 151
 152 /*************************************************************************
 153   General Vectorization Utilities
 154  *************************************************************************/
 155
 156 /* vect_dump will be set to stderr or dump_file if exist.  */
 157 FILE *vect_dump;
 158
 159 /* vect_verbosity_level set to an invalid value
 160    to mark that it's uninitialized.  */
 161 enum verbosity_levels vect_verbosity_level = MAX_VERBOSITY_LEVEL;
 162
 163 /* Loop location.  */
 164 static LOC vect_loop_location;
 165
 166 /* Bitmap of virtual variables to be renamed.  */
 167 bitmap vect_memsyms_to_rename;
 168
 169 /* Vector mapping GIMPLE stmt to stmt_vec_info. */
 170 VEC(vec_void_p,heap) *stmt_vec_info_vec;
 171
 172 \f
 173 /*************************************************************************
 174   Simple Loop Peeling Utilities
 175
 176   Utilities to support loop peeling for vectorization purposes.
 177  *************************************************************************/
 178
 179
 180 /* Renames the use *OP_P.  */
 181
 182 static void
 183 rename_use_op (use_operand_p op_p)
 184 {
 185   tree new_name;
 186
 187   if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
 188     return;
 189
 190   new_name = get_current_def (USE_FROM_PTR (op_p));
 191
 192   /* Something defined outside of the loop.  */
 193   if (!new_name)
 194     return;
 195
 196   /* An ordinary ssa name defined in the loop.  */
 197
 198   SET_USE (op_p, new_name);
 199 }
 200
 201
 202 /* Renames the variables in basic block BB.  */
 203
 204 void
 205 rename_variables_in_bb (basic_block bb)
 206 {
 207   gimple_stmt_iterator gsi;
 208   gimple stmt;
 209   use_operand_p use_p;
 210   ssa_op_iter iter;
 211   edge e;
 212   edge_iterator ei;
 213   struct loop *loop = bb->loop_father;
 214
 215   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 216     {
 217       stmt = gsi_stmt (gsi);
 218       FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
 219         rename_use_op (use_p);
 220     }
 221
 222   FOR_EACH_EDGE (e, ei, bb->succs)
 223     {
 224       if (!flow_bb_inside_loop_p (loop, e->dest))
 225         continue;
 226       for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
 227         rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi_stmt (gsi), e));
 228     }
 229 }
 230
 231
 232 /* Renames variables in new generated LOOP.  */
 233
 234 void
 235 rename_variables_in_loop (struct loop *loop)
 236 {
 237   unsigned i;
 238   basic_block *bbs;
 239
 240   bbs = get_loop_body (loop);
 241
 242   for (i = 0; i < loop->num_nodes; i++)
 243     rename_variables_in_bb (bbs[i]);
 244
 245   free (bbs);
 246 }
 247
 248
 249 /* Update the PHI nodes of NEW_LOOP.
 250
 251    NEW_LOOP is a duplicate of ORIG_LOOP.
 252    AFTER indicates whether NEW_LOOP executes before or after ORIG_LOOP:
 253    AFTER is true if NEW_LOOP executes after ORIG_LOOP, and false if it
 254    executes before it.  */
 255
 256 static void
 257 slpeel_update_phis_for_duplicate_loop (struct loop *orig_loop,
 258                                        struct loop *new_loop, bool after)
 259 {
 260   tree new_ssa_name;
 261   gimple phi_new, phi_orig;
 262   tree def;
 263   edge orig_loop_latch = loop_latch_edge (orig_loop);
 264   edge orig_entry_e = loop_preheader_edge (orig_loop);
 265   edge new_loop_exit_e = single_exit (new_loop);
 266   edge new_loop_entry_e = loop_preheader_edge (new_loop);
 267   edge entry_arg_e = (after ? orig_loop_latch : orig_entry_e);
 268   gimple_stmt_iterator gsi_new, gsi_orig;
 269
 270   /*
 271      step 1. For each loop-header-phi:
 272              Add the first phi argument for the phi in NEW_LOOP
 273             (the one associated with the entry of NEW_LOOP)
 274
 275      step 2. For each loop-header-phi:
 276              Add the second phi argument for the phi in NEW_LOOP
 277             (the one associated with the latch of NEW_LOOP)
 278
 279      step 3. Update the phis in the successor block of NEW_LOOP.
 280
 281         case 1: NEW_LOOP was placed before ORIG_LOOP:
 282                 The successor block of NEW_LOOP is the header of ORIG_LOOP.
 283                 Updating the phis in the successor block can therefore be done
 284                 along with the scanning of the loop header phis, because the
 285                 header blocks of ORIG_LOOP and NEW_LOOP have exactly the same
 286                 phi nodes, organized in the same order.
 287
 288         case 2: NEW_LOOP was placed after ORIG_LOOP:
 289                 The successor block of NEW_LOOP is the original exit block of
 290                 ORIG_LOOP - the phis to be updated are the loop-closed-ssa phis.
 291                 We postpone updating these phis to a later stage (when
 292                 loop guards are added).
 293    */
 294
 295
 296   /* Scan the phis in the headers of the old and new loops
 297      (they are organized in exactly the same order).  */
 298
 299   for (gsi_new = gsi_start_phis (new_loop->header),
 300        gsi_orig = gsi_start_phis (orig_loop->header);
 301        !gsi_end_p (gsi_new) && !gsi_end_p (gsi_orig);
 302        gsi_next (&gsi_new), gsi_next (&gsi_orig))
 303     {
 304       phi_new = gsi_stmt (gsi_new);
 305       phi_orig = gsi_stmt (gsi_orig);
 306
 307       /* step 1.  */
 308       def = PHI_ARG_DEF_FROM_EDGE (phi_orig, entry_arg_e);
 309       add_phi_arg (phi_new, def, new_loop_entry_e);
 310
 311       /* step 2.  */
 312       def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_loop_latch);
 313       if (TREE_CODE (def) != SSA_NAME)
 314         continue;
 315
 316       new_ssa_name = get_current_def (def);
 317       if (!new_ssa_name)
 318         {
 319           /* This only happens if there are no definitions
 320              inside the loop. use the phi_result in this case.  */
 321           new_ssa_name = PHI_RESULT (phi_new);
 322         }
 323
 324       /* An ordinary ssa name defined in the loop.  */
 325       add_phi_arg (phi_new, new_ssa_name, loop_latch_edge (new_loop));
 326
 327       /* step 3 (case 1).  */
 328       if (!after)
 329         {
 330           gcc_assert (new_loop_exit_e == orig_entry_e);
 331           SET_PHI_ARG_DEF (phi_orig,
 332                            new_loop_exit_e->dest_idx,
 333                            new_ssa_name);
 334         }
 335     }
 336 }
 337
 338
 339 /* Update PHI nodes for a guard of the LOOP.
 340
 341    Input:
 342    - LOOP, GUARD_EDGE: LOOP is a loop for which we added guard code that
 343         controls whether LOOP is to be executed.  GUARD_EDGE is the edge that
 344         originates from the guard-bb, skips LOOP and reaches the (unique) exit
 345         bb of LOOP.  This loop-exit-bb is an empty bb with one successor.
 346         We denote this bb NEW_MERGE_BB because before the guard code was added
 347         it had a single predecessor (the LOOP header), and now it became a merge
 348         point of two paths - the path that ends with the LOOP exit-edge, and
 349         the path that ends with GUARD_EDGE.
 350    - NEW_EXIT_BB: New basic block that is added by this function between LOOP
 351         and NEW_MERGE_BB. It is used to place loop-closed-ssa-form exit-phis.
 352
 353    ===> The CFG before the guard-code was added:
 354         LOOP_header_bb:
 355           loop_body
 356           if (exit_loop) goto update_bb
 357           else           goto LOOP_header_bb
 358         update_bb:
 359
 360    ==> The CFG after the guard-code was added:
 361         guard_bb:
 362           if (LOOP_guard_condition) goto new_merge_bb
 363           else                      goto LOOP_header_bb
 364         LOOP_header_bb:
 365           loop_body
 366           if (exit_loop_condition) goto new_merge_bb
 367           else                     goto LOOP_header_bb
 368         new_merge_bb:
 369           goto update_bb
 370         update_bb:
 371
 372    ==> The CFG after this function:
 373         guard_bb:
 374           if (LOOP_guard_condition) goto new_merge_bb
 375           else                      goto LOOP_header_bb
 376         LOOP_header_bb:
 377           loop_body
 378           if (exit_loop_condition) goto new_exit_bb
 379           else                     goto LOOP_header_bb
 380         new_exit_bb:
 381         new_merge_bb:
 382           goto update_bb
 383         update_bb:
 384
 385    This function:
 386    1. creates and updates the relevant phi nodes to account for the new
 387       incoming edge (GUARD_EDGE) into NEW_MERGE_BB. This involves:
 388       1.1. Create phi nodes at NEW_MERGE_BB.
 389       1.2. Update the phi nodes at the successor of NEW_MERGE_BB (denoted
 390            UPDATE_BB).  UPDATE_BB was the exit-bb of LOOP before NEW_MERGE_BB
 391    2. preserves loop-closed-ssa-form by creating the required phi nodes
 392       at the exit of LOOP (i.e, in NEW_EXIT_BB).
 393
 394    There are two flavors to this function:
 395
 396    slpeel_update_phi_nodes_for_guard1:
 397      Here the guard controls whether we enter or skip LOOP, where LOOP is a
 398      prolog_loop (loop1 below), and the new phis created in NEW_MERGE_BB are
 399      for variables that have phis in the loop header.
 400
 401    slpeel_update_phi_nodes_for_guard2:
 402      Here the guard controls whether we enter or skip LOOP, where LOOP is an
 403      epilog_loop (loop2 below), and the new phis created in NEW_MERGE_BB are
 404      for variables that have phis in the loop exit.
 405
 406    I.E., the overall structure is:
 407
 408         loop1_preheader_bb:
 409                 guard1 (goto loop1/merge1_bb)
 410         loop1
 411         loop1_exit_bb:
 412                 guard2 (goto merge1_bb/merge2_bb)
 413         merge1_bb
 414         loop2
 415         loop2_exit_bb
 416         merge2_bb
 417         next_bb
 418
 419    slpeel_update_phi_nodes_for_guard1 takes care of creating phis in
 420    loop1_exit_bb and merge1_bb. These are entry phis (phis for the vars
 421    that have phis in loop1->header).
 422
 423    slpeel_update_phi_nodes_for_guard2 takes care of creating phis in
 424    loop2_exit_bb and merge2_bb. These are exit phis (phis for the vars
 425    that have phis in next_bb). It also adds some of these phis to
 426    loop1_exit_bb.
 427
 428    slpeel_update_phi_nodes_for_guard1 is always called before
 429    slpeel_update_phi_nodes_for_guard2. They are both needed in order
 430    to create correct data-flow and loop-closed-ssa-form.
 431
 432    Generally slpeel_update_phi_nodes_for_guard1 creates phis for variables
 433    that change between iterations of a loop (and therefore have a phi-node
 434    at the loop entry), whereas slpeel_update_phi_nodes_for_guard2 creates
 435    phis for variables that are used out of the loop (and therefore have
 436    loop-closed exit phis). Some variables may be both updated between
 437    iterations and used after the loop. This is why in loop1_exit_bb we
 438    may need both entry_phis (created by slpeel_update_phi_nodes_for_guard1)
 439    and exit phis (created by slpeel_update_phi_nodes_for_guard2).
 440
 441    - IS_NEW_LOOP: if IS_NEW_LOOP is true, then LOOP is a newly created copy of
 442      an original loop. i.e., we have:
 443
 444            orig_loop
 445            guard_bb (goto LOOP/new_merge)
 446            new_loop <-- LOOP
 447            new_exit
 448            new_merge
 449            next_bb
 450
 451      If IS_NEW_LOOP is false, then LOOP is an original loop, in which case we
 452      have:
 453
 454            new_loop
 455            guard_bb (goto LOOP/new_merge)
 456            orig_loop <-- LOOP
 457            new_exit
 458            new_merge
 459            next_bb
 460
 461      The SSA names defined in the original loop have a current
 462      reaching definition that that records the corresponding new
 463      ssa-name used in the new duplicated loop copy.
 464   */
 465
 466 /* Function slpeel_update_phi_nodes_for_guard1
 467
 468    Input:
 469    - GUARD_EDGE, LOOP, IS_NEW_LOOP, NEW_EXIT_BB - as explained above.
 470    - DEFS - a bitmap of ssa names to mark new names for which we recorded
 471             information.
 472
 473    In the context of the overall structure, we have:
 474
 475         loop1_preheader_bb:
 476                 guard1 (goto loop1/merge1_bb)
 477 LOOP->  loop1
 478         loop1_exit_bb:
 479                 guard2 (goto merge1_bb/merge2_bb)
 480         merge1_bb
 481         loop2
 482         loop2_exit_bb
 483         merge2_bb
 484         next_bb
 485
 486    For each name updated between loop iterations (i.e - for each name that has
 487    an entry (loop-header) phi in LOOP) we create a new phi in:
 488    1. merge1_bb (to account for the edge from guard1)
 489    2. loop1_exit_bb (an exit-phi to keep LOOP in loop-closed form)
 490 */
 491
 492 static void
 493 slpeel_update_phi_nodes_for_guard1 (edge guard_edge, struct loop *loop,
 494                                     bool is_new_loop, basic_block *new_exit_bb,
 495                                     bitmap *defs)
 496 {
 497   gimple orig_phi, new_phi;
 498   gimple update_phi, update_phi2;
 499   tree guard_arg, loop_arg;
 500   basic_block new_merge_bb = guard_edge->dest;
 501   edge e = EDGE_SUCC (new_merge_bb, 0);
 502   basic_block update_bb = e->dest;
 503   basic_block orig_bb = loop->header;
 504   edge new_exit_e;
 505   tree current_new_name;
 506   tree name;
 507   gimple_stmt_iterator gsi_orig, gsi_update;
 508
 509   /* Create new bb between loop and new_merge_bb.  */
 510   *new_exit_bb = split_edge (single_exit (loop));
 511
 512   new_exit_e = EDGE_SUCC (*new_exit_bb, 0);
 513
 514   for (gsi_orig = gsi_start_phis (orig_bb),
 515        gsi_update = gsi_start_phis (update_bb);
 516        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
 517        gsi_next (&gsi_orig), gsi_next (&gsi_update))
 518     {
 519       orig_phi = gsi_stmt (gsi_orig);
 520       update_phi = gsi_stmt (gsi_update);
 521
 522       /* Virtual phi; Mark it for renaming. We actually want to call
 523          mar_sym_for_renaming, but since all ssa renaming datastructures
 524          are going to be freed before we get to call ssa_update, we just
 525          record this name for now in a bitmap, and will mark it for
 526          renaming later.  */
 527       name = PHI_RESULT (orig_phi);
 528       if (!is_gimple_reg (SSA_NAME_VAR (name)))
 529         bitmap_set_bit (vect_memsyms_to_rename, DECL_UID (SSA_NAME_VAR (name)));
 530
 531       /** 1. Handle new-merge-point phis  **/
 532
 533       /* 1.1. Generate new phi node in NEW_MERGE_BB:  */
 534       new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
 535                                  new_merge_bb);
 536
 537       /* 1.2. NEW_MERGE_BB has two incoming edges: GUARD_EDGE and the exit-edge
 538             of LOOP. Set the two phi args in NEW_PHI for these edges:  */
 539       loop_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, EDGE_SUCC (loop->latch, 0));
 540       guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, loop_preheader_edge (loop));
 541
 542       add_phi_arg (new_phi, loop_arg, new_exit_e);
 543       add_phi_arg (new_phi, guard_arg, guard_edge);
 544
 545       /* 1.3. Update phi in successor block.  */
 546       gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi, e) == loop_arg
 547                   || PHI_ARG_DEF_FROM_EDGE (update_phi, e) == guard_arg);
 548       SET_PHI_ARG_DEF (update_phi, e->dest_idx, PHI_RESULT (new_phi));
 549       update_phi2 = new_phi;
 550
 551
 552       /** 2. Handle loop-closed-ssa-form phis  **/
 553
 554       if (!is_gimple_reg (PHI_RESULT (orig_phi)))
 555         continue;
 556
 557       /* 2.1. Generate new phi node in NEW_EXIT_BB:  */
 558       new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
 559                                  *new_exit_bb);
 560
 561       /* 2.2. NEW_EXIT_BB has one incoming edge: the exit-edge of the loop.  */
 562       add_phi_arg (new_phi, loop_arg, single_exit (loop));
 563
 564       /* 2.3. Update phi in successor of NEW_EXIT_BB:  */
 565       gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, new_exit_e) == loop_arg);
 566       SET_PHI_ARG_DEF (update_phi2, new_exit_e->dest_idx, PHI_RESULT (new_phi));
 567
 568       /* 2.4. Record the newly created name with set_current_def.
 569          We want to find a name such that
 570                 name = get_current_def (orig_loop_name)
 571          and to set its current definition as follows:
 572                 set_current_def (name, new_phi_name)
 573
 574          If LOOP is a new loop then loop_arg is already the name we're
 575          looking for. If LOOP is the original loop, then loop_arg is
 576          the orig_loop_name and the relevant name is recorded in its
 577          current reaching definition.  */
 578       if (is_new_loop)
 579         current_new_name = loop_arg;
 580       else
 581         {
 582           current_new_name = get_current_def (loop_arg);
 583           /* current_def is not available only if the variable does not
 584              change inside the loop, in which case we also don't care
 585              about recording a current_def for it because we won't be
 586              trying to create loop-exit-phis for it.  */
 587           if (!current_new_name)
 588             continue;
 589         }
 590       gcc_assert (get_current_def (current_new_name) == NULL_TREE);
 591
 592       set_current_def (current_new_name, PHI_RESULT (new_phi));
 593       bitmap_set_bit (*defs, SSA_NAME_VERSION (current_new_name));
 594     }
 595 }
 596
 597
 598 /* Function slpeel_update_phi_nodes_for_guard2
 599
 600    Input:
 601    - GUARD_EDGE, LOOP, IS_NEW_LOOP, NEW_EXIT_BB - as explained above.
 602
 603    In the context of the overall structure, we have:
 604
 605         loop1_preheader_bb:
 606                 guard1 (goto loop1/merge1_bb)
 607         loop1
 608         loop1_exit_bb:
 609                 guard2 (goto merge1_bb/merge2_bb)
 610         merge1_bb
 611 LOOP->  loop2
 612         loop2_exit_bb
 613         merge2_bb
 614         next_bb
 615
 616    For each name used out side the loop (i.e - for each name that has an exit
 617    phi in next_bb) we create a new phi in:
 618    1. merge2_bb (to account for the edge from guard_bb)
 619    2. loop2_exit_bb (an exit-phi to keep LOOP in loop-closed form)
 620    3. guard2 bb (an exit phi to keep the preceding loop in loop-closed form),
 621       if needed (if it wasn't handled by slpeel_update_phis_nodes_for_phi1).
 622 */
 623
 624 static void
 625 slpeel_update_phi_nodes_for_guard2 (edge guard_edge, struct loop *loop,
 626                                     bool is_new_loop, basic_block *new_exit_bb)
 627 {
 628   gimple orig_phi, new_phi;
 629   gimple update_phi, update_phi2;
 630   tree guard_arg, loop_arg;
 631   basic_block new_merge_bb = guard_edge->dest;
 632   edge e = EDGE_SUCC (new_merge_bb, 0);
 633   basic_block update_bb = e->dest;
 634   edge new_exit_e;
 635   tree orig_def, orig_def_new_name;
 636   tree new_name, new_name2;
 637   tree arg;
 638   gimple_stmt_iterator gsi;
 639
 640   /* Create new bb between loop and new_merge_bb.  */
 641   *new_exit_bb = split_edge (single_exit (loop));
 642
 643   new_exit_e = EDGE_SUCC (*new_exit_bb, 0);
 644
 645   for (gsi = gsi_start_phis (update_bb); !gsi_end_p (gsi); gsi_next (&gsi))
 646     {
 647       update_phi = gsi_stmt (gsi);
 648       orig_phi = update_phi;
 649       orig_def = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
 650       /* This loop-closed-phi actually doesn't represent a use
 651          out of the loop - the phi arg is a constant.  */
 652       if (TREE_CODE (orig_def) != SSA_NAME)
 653         continue;
 654       orig_def_new_name = get_current_def (orig_def);
 655       arg = NULL_TREE;
 656
 657       /** 1. Handle new-merge-point phis  **/
 658
 659       /* 1.1. Generate new phi node in NEW_MERGE_BB:  */
 660       new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
 661                                  new_merge_bb);
 662
 663       /* 1.2. NEW_MERGE_BB has two incoming edges: GUARD_EDGE and the exit-edge
 664             of LOOP. Set the two PHI args in NEW_PHI for these edges:  */
 665       new_name = orig_def;
 666       new_name2 = NULL_TREE;
 667       if (orig_def_new_name)
 668         {
 669           new_name = orig_def_new_name;
 670           /* Some variables have both loop-entry-phis and loop-exit-phis.
 671              Such variables were given yet newer names by phis placed in
 672              guard_bb by slpeel_update_phi_nodes_for_guard1. I.e:
 673              new_name2 = get_current_def (get_current_def (orig_name)).  */
 674           new_name2 = get_current_def (new_name);
 675         }
 676
 677       if (is_new_loop)
 678         {
 679           guard_arg = orig_def;
 680           loop_arg = new_name;
 681         }
 682       else
 683         {
 684           guard_arg = new_name;
 685           loop_arg = orig_def;
 686         }
 687       if (new_name2)
 688         guard_arg = new_name2;
 689
 690       add_phi_arg (new_phi, loop_arg, new_exit_e);
 691       add_phi_arg (new_phi, guard_arg, guard_edge);
 692
 693       /* 1.3. Update phi in successor block.  */
 694       gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi, e) == orig_def);
 695       SET_PHI_ARG_DEF (update_phi, e->dest_idx, PHI_RESULT (new_phi));
 696       update_phi2 = new_phi;
 697
 698
 699       /** 2. Handle loop-closed-ssa-form phis  **/
 700
 701       /* 2.1. Generate new phi node in NEW_EXIT_BB:  */
 702       new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
 703                                  *new_exit_bb);
 704
 705       /* 2.2. NEW_EXIT_BB has one incoming edge: the exit-edge of the loop.  */
 706       add_phi_arg (new_phi, loop_arg, single_exit (loop));
 707
 708       /* 2.3. Update phi in successor of NEW_EXIT_BB:  */
 709       gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, new_exit_e) == loop_arg);
 710       SET_PHI_ARG_DEF (update_phi2, new_exit_e->dest_idx, PHI_RESULT (new_phi));
 711
 712
 713       /** 3. Handle loop-closed-ssa-form phis for first loop  **/
 714
 715       /* 3.1. Find the relevant names that need an exit-phi in
 716          GUARD_BB, i.e. names for which
 717          slpeel_update_phi_nodes_for_guard1 had not already created a
 718          phi node. This is the case for names that are used outside
 719          the loop (and therefore need an exit phi) but are not updated
 720          across loop iterations (and therefore don't have a
 721          loop-header-phi).
 722
 723          slpeel_update_phi_nodes_for_guard1 is responsible for
 724          creating loop-exit phis in GUARD_BB for names that have a
 725          loop-header-phi.  When such a phi is created we also record
 726          the new name in its current definition.  If this new name
 727          exists, then guard_arg was set to this new name (see 1.2
 728          above).  Therefore, if guard_arg is not this new name, this
 729          is an indication that an exit-phi in GUARD_BB was not yet
 730          created, so we take care of it here.  */
 731       if (guard_arg == new_name2)
 732         continue;
 733       arg = guard_arg;
 734
 735       /* 3.2. Generate new phi node in GUARD_BB:  */
 736       new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
 737                                  guard_edge->src);
 738
 739       /* 3.3. GUARD_BB has one incoming edge:  */
 740       gcc_assert (EDGE_COUNT (guard_edge->src->preds) == 1);
 741       add_phi_arg (new_phi, arg, EDGE_PRED (guard_edge->src, 0));
 742
 743       /* 3.4. Update phi in successor of GUARD_BB:  */
 744       gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, guard_edge)
 745                                                                 == guard_arg);
 746       SET_PHI_ARG_DEF (update_phi2, guard_edge->dest_idx, PHI_RESULT (new_phi));
 747     }
 748 }
 749
 750
 751 /* Make the LOOP iterate NITERS times. This is done by adding a new IV
 752    that starts at zero, increases by one and its limit is NITERS.
 753
 754    Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
 755
 756 void
 757 slpeel_make_loop_iterate_ntimes (struct loop *loop, tree niters)
 758 {
 759   tree indx_before_incr, indx_after_incr;
 760   gimple cond_stmt;
 761   gimple orig_cond;
 762   edge exit_edge = single_exit (loop);
 763   gimple_stmt_iterator loop_cond_gsi;
 764   gimple_stmt_iterator incr_gsi;
 765   bool insert_after;
 766   tree init = build_int_cst (TREE_TYPE (niters), 0);
 767   tree step = build_int_cst (TREE_TYPE (niters), 1);
 768   LOC loop_loc;
 769   enum tree_code code;
 770
 771   orig_cond = get_loop_exit_condition (loop);
 772   gcc_assert (orig_cond);
 773   loop_cond_gsi = gsi_for_stmt (orig_cond);
 774
 775   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 776   create_iv (init, step, NULL_TREE, loop,
 777              &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
 778
 779   indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
 780                                               true, NULL_TREE, true,
 781                                               GSI_SAME_STMT);
 782   niters = force_gimple_operand_gsi (&loop_cond_gsi, niters, true, NULL_TREE,
 783                                      true, GSI_SAME_STMT);
 784
 785   code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
 786   cond_stmt = gimple_build_cond (code, indx_after_incr, niters, NULL_TREE,
 787                                  NULL_TREE);
 788
 789   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
 790
 791   /* Remove old loop exit test:  */
 792   gsi_remove (&loop_cond_gsi, true);
 793
 794   loop_loc = find_loop_location (loop);
 795   if (dump_file && (dump_flags & TDF_DETAILS))
 796     {
 797       if (loop_loc != UNKNOWN_LOC)
 798         fprintf (dump_file, "\nloop at %s:%d: ",
 799                  LOC_FILE (loop_loc), LOC_LINE (loop_loc));
 800       print_gimple_stmt (dump_file, cond_stmt, 0, TDF_SLIM);
 801     }
 802
 803   loop->nb_iterations = niters;
 804 }
 805
 806
 807 /* Given LOOP this function generates a new copy of it and puts it
 808    on E which is either the entry or exit of LOOP.  */
 809
 810 struct loop *
 811 slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e)
 812 {
 813   struct loop *new_loop;
 814   basic_block *new_bbs, *bbs;
 815   bool at_exit;
 816   bool was_imm_dom;
 817   basic_block exit_dest;
 818   gimple phi;
 819   tree phi_arg;
 820   edge exit, new_exit;
 821   gimple_stmt_iterator gsi;
 822
 823   at_exit = (e == single_exit (loop));
 824   if (!at_exit && e != loop_preheader_edge (loop))
 825     return NULL;
 826
 827   bbs = get_loop_body (loop);
 828
 829   /* Check whether duplication is possible.  */
 830   if (!can_copy_bbs_p (bbs, loop->num_nodes))
 831     {
 832       free (bbs);
 833       return NULL;
 834     }
 835
 836   /* Generate new loop structure.  */
 837   new_loop = duplicate_loop (loop, loop_outer (loop));
 838   if (!new_loop)
 839     {
 840       free (bbs);
 841       return NULL;
 842     }
 843
 844   exit_dest = single_exit (loop)->dest;
 845   was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
 846                                           exit_dest) == loop->header ?
 847                  true : false);
 848
 849   new_bbs = XNEWVEC (basic_block, loop->num_nodes);
 850
 851   exit = single_exit (loop);
 852   copy_bbs (bbs, loop->num_nodes, new_bbs,
 853             &exit, 1, &new_exit, NULL,
 854             e->src);
 855
 856   /* Duplicating phi args at exit bbs as coming
 857      also from exit of duplicated loop.  */
 858   for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi); gsi_next (&gsi))
 859     {
 860       phi = gsi_stmt (gsi);
 861       phi_arg = PHI_ARG_DEF_FROM_EDGE (phi, single_exit (loop));
 862       if (phi_arg)
 863         {
 864           edge new_loop_exit_edge;
 865
 866           if (EDGE_SUCC (new_loop->header, 0)->dest == new_loop->latch)
 867             new_loop_exit_edge = EDGE_SUCC (new_loop->header, 1);
 868           else
 869             new_loop_exit_edge = EDGE_SUCC (new_loop->header, 0);
 870
 871           add_phi_arg (phi, phi_arg, new_loop_exit_edge);
 872         }
 873     }
 874
 875   if (at_exit) /* Add the loop copy at exit.  */
 876     {
 877       redirect_edge_and_branch_force (e, new_loop->header);
 878       PENDING_STMT (e) = NULL;
 879       set_immediate_dominator (CDI_DOMINATORS, new_loop->header, e->src);
 880       if (was_imm_dom)
 881         set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_loop->header);
 882     }
 883   else /* Add the copy at entry.  */
 884     {
 885       edge new_exit_e;
 886       edge entry_e = loop_preheader_edge (loop);
 887       basic_block preheader = entry_e->src;
 888
 889       if (!flow_bb_inside_loop_p (new_loop,
 890                                   EDGE_SUCC (new_loop->header, 0)->dest))
 891         new_exit_e = EDGE_SUCC (new_loop->header, 0);
 892       else
 893         new_exit_e = EDGE_SUCC (new_loop->header, 1);
 894
 895       redirect_edge_and_branch_force (new_exit_e, loop->header);
 896       PENDING_STMT (new_exit_e) = NULL;
 897       set_immediate_dominator (CDI_DOMINATORS, loop->header,
 898                                new_exit_e->src);
 899
 900       /* We have to add phi args to the loop->header here as coming
 901          from new_exit_e edge.  */
 902       for (gsi = gsi_start_phis (loop->header);
 903            !gsi_end_p (gsi);
 904            gsi_next (&gsi))
 905         {
 906           phi = gsi_stmt (gsi);
 907           phi_arg = PHI_ARG_DEF_FROM_EDGE (phi, entry_e);
 908           if (phi_arg)
 909             add_phi_arg (phi, phi_arg, new_exit_e);
 910         }
 911
 912       redirect_edge_and_branch_force (entry_e, new_loop->header);
 913       PENDING_STMT (entry_e) = NULL;
 914       set_immediate_dominator (CDI_DOMINATORS, new_loop->header, preheader);
 915     }
 916
 917   free (new_bbs);
 918   free (bbs);
 919
 920   return new_loop;
 921 }
 922
 923
 924 /* Given the condition statement COND, put it as the last statement
 925    of GUARD_BB; EXIT_BB is the basic block to skip the loop;
 926    Assumes that this is the single exit of the guarded loop.
 927    Returns the skip edge.  */
 928
 929 static edge
 930 slpeel_add_loop_guard (basic_block guard_bb, tree cond, basic_block exit_bb,
 931                        basic_block dom_bb)
 932 {
 933   gimple_stmt_iterator gsi;
 934   edge new_e, enter_e;
 935   gimple cond_stmt;
 936   gimple_seq gimplify_stmt_list = NULL;
 937
 938   enter_e = EDGE_SUCC (guard_bb, 0);
 939   enter_e->flags &= ~EDGE_FALLTHRU;
 940   enter_e->flags |= EDGE_FALSE_VALUE;
 941   gsi = gsi_last_bb (guard_bb);
 942
 943   cond = force_gimple_operand (cond, &gimplify_stmt_list, true, NULL_TREE);
 944   cond_stmt = gimple_build_cond (NE_EXPR,
 945                                  cond, build_int_cst (TREE_TYPE (cond), 0),
 946                                  NULL_TREE, NULL_TREE);
 947   if (gimplify_stmt_list)
 948     gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
 949
 950   gsi = gsi_last_bb (guard_bb);
 951   gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
 952
 953   /* Add new edge to connect guard block to the merge/loop-exit block.  */
 954   new_e = make_edge (guard_bb, exit_bb, EDGE_TRUE_VALUE);
 955   set_immediate_dominator (CDI_DOMINATORS, exit_bb, dom_bb);
 956   return new_e;
 957 }
 958
 959
 960 /* This function verifies that the following restrictions apply to LOOP:
 961    (1) it is innermost
 962    (2) it consists of exactly 2 basic blocks - header, and an empty latch.
 963    (3) it is single entry, single exit
 964    (4) its exit condition is the last stmt in the header
 965    (5) E is the entry/exit edge of LOOP.
 966  */
 967
 968 bool
 969 slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
 970 {
 971   edge exit_e = single_exit (loop);
 972   edge entry_e = loop_preheader_edge (loop);
 973   gimple orig_cond = get_loop_exit_condition (loop);
 974   gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
 975
 976   if (need_ssa_update_p ())
 977     return false;
 978
 979   if (loop->inner
 980       /* All loops have an outer scope; the only case loop->outer is NULL is for
 981          the function itself.  */
 982       || !loop_outer (loop)
 983       || loop->num_nodes != 2
 984       || !empty_block_p (loop->latch)
 985       || !single_exit (loop)
 986       /* Verify that new loop exit condition can be trivially modified.  */
 987       || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
 988       || (e != exit_e && e != entry_e))
 989     return false;
 990
 991   return true;
 992 }
 993
 994 #ifdef ENABLE_CHECKING
 995 void
 996 slpeel_verify_cfg_after_peeling (struct loop *first_loop,
 997                                  struct loop *second_loop)
 998 {
 999   basic_block loop1_exit_bb = single_exit (first_loop)->dest;
1000   basic_block loop2_entry_bb = loop_preheader_edge (second_loop)->src;
1001   basic_block loop1_entry_bb = loop_preheader_edge (first_loop)->src;
1002
1003   /* A guard that controls whether the second_loop is to be executed or skipped
1004      is placed in first_loop->exit.  first_loop->exit therefore has two
1005      successors - one is the preheader of second_loop, and the other is a bb
1006      after second_loop.
1007    */
1008   gcc_assert (EDGE_COUNT (loop1_exit_bb->succs) == 2);
1009
1010   /* 1. Verify that one of the successors of first_loop->exit is the preheader
1011         of second_loop.  */
1012
1013   /* The preheader of new_loop is expected to have two predecessors:
1014      first_loop->exit and the block that precedes first_loop.  */
1015
1016   gcc_assert (EDGE_COUNT (loop2_entry_bb->preds) == 2
1017               && ((EDGE_PRED (loop2_entry_bb, 0)->src == loop1_exit_bb
1018                    && EDGE_PRED (loop2_entry_bb, 1)->src == loop1_entry_bb)
1019                || (EDGE_PRED (loop2_entry_bb, 1)->src ==  loop1_exit_bb
1020                    && EDGE_PRED (loop2_entry_bb, 0)->src == loop1_entry_bb)));
1021
1022   /* Verify that the other successor of first_loop->exit is after the
1023      second_loop.  */
1024   /* TODO */
1025 }
1026 #endif
1027
1028 /* If the run time cost model check determines that vectorization is
1029    not profitable and hence scalar loop should be generated then set
1030    FIRST_NITERS to prologue peeled iterations. This will allow all the
1031    iterations to be executed in the prologue peeled scalar loop.  */
1032
1033 void
1034 set_prologue_iterations (basic_block bb_before_first_loop,
1035                          tree first_niters,
1036                          struct loop *loop,
1037                          unsigned int th)
1038 {
1039   edge e;
1040   basic_block cond_bb, then_bb;
1041   tree var, prologue_after_cost_adjust_name;
1042   gimple_stmt_iterator gsi;
1043   gimple newphi;
1044   edge e_true, e_false, e_fallthru;
1045   gimple cond_stmt;
1046   gimple_seq gimplify_stmt_list = NULL, stmts = NULL;
1047   tree cost_pre_condition = NULL_TREE;
1048   tree scalar_loop_iters =
1049     unshare_expr (LOOP_VINFO_NITERS_UNCHANGED (loop_vec_info_for_loop (loop)));
1050
1051   e = single_pred_edge (bb_before_first_loop);
1052   cond_bb = split_edge(e);
1053
1054   e = single_pred_edge (bb_before_first_loop);
1055   then_bb = split_edge(e);
1056   set_immediate_dominator (CDI_DOMINATORS, then_bb, cond_bb);
1057
1058   e_false = make_single_succ_edge (cond_bb, bb_before_first_loop,
1059                                    EDGE_FALSE_VALUE);
1060   set_immediate_dominator (CDI_DOMINATORS, bb_before_first_loop, cond_bb);
1061
1062   e_true = EDGE_PRED (then_bb, 0);
1063   e_true->flags &= ~EDGE_FALLTHRU;
1064   e_true->flags |= EDGE_TRUE_VALUE;
1065
1066   e_fallthru = EDGE_SUCC (then_bb, 0);
1067
1068   cost_pre_condition =
1069     build2 (LE_EXPR, boolean_type_node, scalar_loop_iters,
1070             build_int_cst (TREE_TYPE (scalar_loop_iters), th));
1071   cost_pre_condition =
1072     force_gimple_operand (cost_pre_condition, &gimplify_stmt_list,
1073                           true, NULL_TREE);
1074   cond_stmt = gimple_build_cond (NE_EXPR, cost_pre_condition,
1075                                  build_int_cst (TREE_TYPE (cost_pre_condition),
1076                                                 0), NULL_TREE, NULL_TREE);
1077
1078   gsi = gsi_last_bb (cond_bb);
1079   if (gimplify_stmt_list)
1080     gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1081
1082   gsi = gsi_last_bb (cond_bb);
1083   gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1084
1085   var = create_tmp_var (TREE_TYPE (scalar_loop_iters),
1086                         "prologue_after_cost_adjust");
1087   add_referenced_var (var);
1088   prologue_after_cost_adjust_name =
1089     force_gimple_operand (scalar_loop_iters, &stmts, false, var);
1090
1091   gsi = gsi_last_bb (then_bb);
1092   if (stmts)
1093     gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT);
1094
1095   newphi = create_phi_node (var, bb_before_first_loop);
1096   add_phi_arg (newphi, prologue_after_cost_adjust_name, e_fallthru);
1097   add_phi_arg (newphi, first_niters, e_false);
1098
1099   first_niters = PHI_RESULT (newphi);
1100 }
1101
1102
1103 /* Function slpeel_tree_peel_loop_to_edge.
1104
1105    Peel the first (last) iterations of LOOP into a new prolog (epilog) loop
1106    that is placed on the entry (exit) edge E of LOOP. After this transformation
1107    we have two loops one after the other - first-loop iterates FIRST_NITERS
1108    times, and second-loop iterates the remainder NITERS - FIRST_NITERS times.
1109    If the cost model indicates that it is profitable to emit a scalar
1110    loop instead of the vector one, then the prolog (epilog) loop will iterate
1111    for the entire unchanged scalar iterations of the loop.
1112
1113    Input:
1114    - LOOP: the loop to be peeled.
1115    - E: the exit or entry edge of LOOP.
1116         If it is the entry edge, we peel the first iterations of LOOP. In this
1117         case first-loop is LOOP, and second-loop is the newly created loop.
1118         If it is the exit edge, we peel the last iterations of LOOP. In this
1119         case, first-loop is the newly created loop, and second-loop is LOOP.
1120    - NITERS: the number of iterations that LOOP iterates.
1121    - FIRST_NITERS: the number of iterations that the first-loop should iterate.
1122    - UPDATE_FIRST_LOOP_COUNT:  specified whether this function is responsible
1123         for updating the loop bound of the first-loop to FIRST_NITERS.  If it
1124         is false, the caller of this function may want to take care of this
1125         (this can be useful if we don't want new stmts added to first-loop).
1126    - TH: cost model profitability threshold of iterations for vectorization.
1127    - CHECK_PROFITABILITY: specify whether cost model check has not occurred
1128                           during versioning and hence needs to occur during
1129                           prologue generation or whether cost model check
1130                           has not occurred during prologue generation and hence
1131                           needs to occur during epilogue generation.
1132
1133
1134    Output:
1135    The function returns a pointer to the new loop-copy, or NULL if it failed
1136    to perform the transformation.
1137
1138    The function generates two if-then-else guards: one before the first loop,
1139    and the other before the second loop:
1140    The first guard is:
1141      if (FIRST_NITERS == 0) then skip the first loop,
1142      and go directly to the second loop.
1143    The second guard is:
1144      if (FIRST_NITERS == NITERS) then skip the second loop.
1145
1146    FORNOW only simple loops are supported (see slpeel_can_duplicate_loop_p).
1147    FORNOW the resulting code will not be in loop-closed-ssa form.
1148 */
1149
1150 struct loop*
1151 slpeel_tree_peel_loop_to_edge (struct loop *loop,
1152                                edge e, tree first_niters,
1153                                tree niters, bool update_first_loop_count,
1154                                unsigned int th, bool check_profitability)
1155 {
1156   struct loop *new_loop = NULL, *first_loop, *second_loop;
1157   edge skip_e;
1158   tree pre_condition = NULL_TREE;
1159   bitmap definitions;
1160   basic_block bb_before_second_loop, bb_after_second_loop;
1161   basic_block bb_before_first_loop;
1162   basic_block bb_between_loops;
1163   basic_block new_exit_bb;
1164   edge exit_e = single_exit (loop);
1165   LOC loop_loc;
1166   tree cost_pre_condition = NULL_TREE;
1167
1168   if (!slpeel_can_duplicate_loop_p (loop, e))
1169     return NULL;
1170
1171   /* We have to initialize cfg_hooks. Then, when calling
1172    cfg_hooks->split_edge, the function tree_split_edge
1173    is actually called and, when calling cfg_hooks->duplicate_block,
1174    the function tree_duplicate_bb is called.  */
1175   gimple_register_cfg_hooks ();
1176
1177
1178   /* 1. Generate a copy of LOOP and put it on E (E is the entry/exit of LOOP).
1179         Resulting CFG would be:
1180
1181         first_loop:
1182         do {
1183         } while ...
1184
1185         second_loop:
1186         do {
1187         } while ...
1188
1189         orig_exit_bb:
1190    */
1191
1192   if (!(new_loop = slpeel_tree_duplicate_loop_to_edge_cfg (loop, e)))
1193     {
1194       loop_loc = find_loop_location (loop);
1195       if (dump_file && (dump_flags & TDF_DETAILS))
1196         {
1197           if (loop_loc != UNKNOWN_LOC)
1198             fprintf (dump_file, "\n%s:%d: note: ",
1199                      LOC_FILE (loop_loc), LOC_LINE (loop_loc));
1200           fprintf (dump_file, "tree_duplicate_loop_to_edge_cfg failed.\n");
1201         }
1202       return NULL;
1203     }
1204
1205   if (e == exit_e)
1206     {
1207       /* NEW_LOOP was placed after LOOP.  */
1208       first_loop = loop;
1209       second_loop = new_loop;
1210     }
1211   else
1212     {
1213       /* NEW_LOOP was placed before LOOP.  */
1214       first_loop = new_loop;
1215       second_loop = loop;
1216     }
1217
1218   definitions = ssa_names_to_replace ();
1219   slpeel_update_phis_for_duplicate_loop (loop, new_loop, e == exit_e);
1220   rename_variables_in_loop (new_loop);
1221
1222
1223   /* 2.  Add the guard code in one of the following ways:
1224
1225      2.a Add the guard that controls whether the first loop is executed.
1226          This occurs when this function is invoked for prologue or epilogue
1227          generation and when the cost model check can be done at compile time.
1228
1229          Resulting CFG would be:
1230
1231          bb_before_first_loop:
1232          if (FIRST_NITERS == 0) GOTO bb_before_second_loop
1233                                 GOTO first-loop
1234
1235          first_loop:
1236          do {
1237          } while ...
1238
1239          bb_before_second_loop:
1240
1241          second_loop:
1242          do {
1243          } while ...
1244
1245          orig_exit_bb:
1246
1247      2.b Add the cost model check that allows the prologue
1248          to iterate for the entire unchanged scalar
1249          iterations of the loop in the event that the cost
1250          model indicates that the scalar loop is more
1251          profitable than the vector one. This occurs when
1252          this function is invoked for prologue generation
1253          and the cost model check needs to be done at run
1254          time.
1255
1256          Resulting CFG after prologue peeling would be:
1257
1258          if (scalar_loop_iterations <= th)
1259            FIRST_NITERS = scalar_loop_iterations
1260
1261          bb_before_first_loop:
1262          if (FIRST_NITERS == 0) GOTO bb_before_second_loop
1263                                 GOTO first-loop
1264
1265          first_loop:
1266          do {
1267          } while ...
1268
1269          bb_before_second_loop:
1270
1271          second_loop:
1272          do {
1273          } while ...
1274
1275          orig_exit_bb:
1276
1277      2.c Add the cost model check that allows the epilogue
1278          to iterate for the entire unchanged scalar
1279          iterations of the loop in the event that the cost
1280          model indicates that the scalar loop is more
1281          profitable than the vector one. This occurs when
1282          this function is invoked for epilogue generation
1283          and the cost model check needs to be done at run
1284          time.
1285
1286          Resulting CFG after prologue peeling would be:
1287
1288          bb_before_first_loop:
1289          if ((scalar_loop_iterations <= th)
1290              ||
1291              FIRST_NITERS == 0) GOTO bb_before_second_loop
1292                                 GOTO first-loop
1293
1294          first_loop:
1295          do {
1296          } while ...
1297
1298          bb_before_second_loop:
1299
1300          second_loop:
1301          do {
1302          } while ...
1303
1304          orig_exit_bb:
1305   */
1306
1307   bb_before_first_loop = split_edge (loop_preheader_edge (first_loop));
1308   bb_before_second_loop = split_edge (single_exit (first_loop));
1309
1310   /* Epilogue peeling.  */
1311   if (!update_first_loop_count)
1312     {
1313       pre_condition =
1314         fold_build2 (LE_EXPR, boolean_type_node, first_niters,
1315                      build_int_cst (TREE_TYPE (first_niters), 0));
1316       if (check_profitability)
1317         {
1318           tree scalar_loop_iters
1319             = unshare_expr (LOOP_VINFO_NITERS_UNCHANGED
1320                                         (loop_vec_info_for_loop (loop)));
1321           cost_pre_condition =
1322             build2 (LE_EXPR, boolean_type_node, scalar_loop_iters,
1323                     build_int_cst (TREE_TYPE (scalar_loop_iters), th));
1324
1325           pre_condition = fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
1326                                        cost_pre_condition, pre_condition);
1327         }
1328     }
1329
1330   /* Prologue peeling.  */
1331   else
1332     {
1333       if (check_profitability)
1334         set_prologue_iterations (bb_before_first_loop, first_niters,
1335                                  loop, th);
1336
1337       pre_condition =
1338         fold_build2 (LE_EXPR, boolean_type_node, first_niters,
1339                      build_int_cst (TREE_TYPE (first_niters), 0));
1340     }
1341
1342   skip_e = slpeel_add_loop_guard (bb_before_first_loop, pre_condition,
1343                                   bb_before_second_loop, bb_before_first_loop);
1344   slpeel_update_phi_nodes_for_guard1 (skip_e, first_loop,
1345                                       first_loop == new_loop,
1346                                       &new_exit_bb, &definitions);
1347
1348
1349   /* 3. Add the guard that controls whether the second loop is executed.
1350         Resulting CFG would be:
1351
1352         bb_before_first_loop:
1353         if (FIRST_NITERS == 0) GOTO bb_before_second_loop (skip first loop)
1354                                GOTO first-loop
1355
1356         first_loop:
1357         do {
1358         } while ...
1359
1360         bb_between_loops:
1361         if (FIRST_NITERS == NITERS) GOTO bb_after_second_loop (skip second loop)
1362                                     GOTO bb_before_second_loop
1363
1364         bb_before_second_loop:
1365
1366         second_loop:
1367         do {
1368         } while ...
1369
1370         bb_after_second_loop:
1371
1372         orig_exit_bb:
1373    */
1374
1375   bb_between_loops = new_exit_bb;
1376   bb_after_second_loop = split_edge (single_exit (second_loop));
1377
1378   pre_condition =
1379         fold_build2 (EQ_EXPR, boolean_type_node, first_niters, niters);
1380   skip_e = slpeel_add_loop_guard (bb_between_loops, pre_condition,
1381                                   bb_after_second_loop, bb_before_first_loop);
1382   slpeel_update_phi_nodes_for_guard2 (skip_e, second_loop,
1383                                      second_loop == new_loop, &new_exit_bb);
1384
1385   /* 4. Make first-loop iterate FIRST_NITERS times, if requested.
1386    */
1387   if (update_first_loop_count)
1388     slpeel_make_loop_iterate_ntimes (first_loop, first_niters);
1389
1390   BITMAP_FREE (definitions);
1391   delete_update_ssa ();
1392
1393   return new_loop;
1394 }
1395
1396 /* Function vect_get_loop_location.
1397
1398    Extract the location of the loop in the source code.
1399    If the loop is not well formed for vectorization, an estimated
1400    location is calculated.
1401    Return the loop location if succeed and NULL if not.  */
1402
1403 LOC
1404 find_loop_location (struct loop *loop)
1405 {
1406   gimple stmt = NULL;
1407   basic_block bb;
1408   gimple_stmt_iterator si;
1409
1410   if (!loop)
1411     return UNKNOWN_LOC;
1412
1413   stmt = get_loop_exit_condition (loop);
1414
1415   if (stmt && gimple_location (stmt) != UNKNOWN_LOC)
1416     return gimple_location (stmt);
1417
1418   /* If we got here the loop is probably not "well formed",
1419      try to estimate the loop location */
1420
1421   if (!loop->header)
1422     return UNKNOWN_LOC;
1423
1424   bb = loop->header;
1425
1426   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1427     {
1428       stmt = gsi_stmt (si);
1429       if (gimple_location (stmt) != UNKNOWN_LOC)
1430         return gimple_location (stmt);
1431     }
1432
1433   return UNKNOWN_LOC;
1434 }
1435
1436
1437 /*************************************************************************
1438   Vectorization Debug Information.
1439  *************************************************************************/
1440
1441 /* Function vect_set_verbosity_level.
1442
1443    Called from toplev.c upon detection of the
1444    -ftree-vectorizer-verbose=N option.  */
1445
1446 void
1447 vect_set_verbosity_level (const char *val)
1448 {
1449    unsigned int vl;
1450
1451    vl = atoi (val);
1452    if (vl < MAX_VERBOSITY_LEVEL)
1453      vect_verbosity_level = vl;
1454    else
1455      vect_verbosity_level = MAX_VERBOSITY_LEVEL - 1;
1456 }
1457
1458
1459 /* Function vect_set_dump_settings.
1460
1461    Fix the verbosity level of the vectorizer if the
1462    requested level was not set explicitly using the flag
1463    -ftree-vectorizer-verbose=N.
1464    Decide where to print the debugging information (dump_file/stderr).
1465    If the user defined the verbosity level, but there is no dump file,
1466    print to stderr, otherwise print to the dump file.  */
1467
1468 static void
1469 vect_set_dump_settings (void)
1470 {
1471   vect_dump = dump_file;
1472
1473   /* Check if the verbosity level was defined by the user:  */
1474   if (vect_verbosity_level != MAX_VERBOSITY_LEVEL)
1475     {
1476       /* If there is no dump file, print to stderr.  */
1477       if (!dump_file)
1478         vect_dump = stderr;
1479       return;
1480     }
1481
1482   /* User didn't specify verbosity level:  */
1483   if (dump_file && (dump_flags & TDF_DETAILS))
1484     vect_verbosity_level = REPORT_DETAILS;
1485   else if (dump_file && (dump_flags & TDF_STATS))
1486     vect_verbosity_level = REPORT_UNVECTORIZED_LOOPS;
1487   else
1488     vect_verbosity_level = REPORT_NONE;
1489
1490   gcc_assert (dump_file || vect_verbosity_level == REPORT_NONE);
1491 }
1492
1493
1494 /* Function debug_loop_details.
1495
1496    For vectorization debug dumps.  */
1497
1498 bool
1499 vect_print_dump_info (enum verbosity_levels vl)
1500 {
1501   if (vl > vect_verbosity_level)
1502     return false;
1503
1504   if (!current_function_decl || !vect_dump)
1505     return false;
1506
1507   if (vect_loop_location == UNKNOWN_LOC)
1508     fprintf (vect_dump, "\n%s:%d: note: ",
1509              DECL_SOURCE_FILE (current_function_decl),
1510              DECL_SOURCE_LINE (current_function_decl));
1511   else
1512     fprintf (vect_dump, "\n%s:%d: note: ",
1513              LOC_FILE (vect_loop_location), LOC_LINE (vect_loop_location));
1514
1515   return true;
1516 }
1517
1518
1519 /*************************************************************************
1520   Vectorization Utilities.
1521  *************************************************************************/
1522
1523 /* Function new_stmt_vec_info.
1524
1525    Create and initialize a new stmt_vec_info struct for STMT.  */
1526
1527 stmt_vec_info
1528 new_stmt_vec_info (gimple stmt, loop_vec_info loop_vinfo)
1529 {
1530   stmt_vec_info res;
1531   res = (stmt_vec_info) xcalloc (1, sizeof (struct _stmt_vec_info));
1532
1533   STMT_VINFO_TYPE (res) = undef_vec_info_type;
1534   STMT_VINFO_STMT (res) = stmt;
1535   STMT_VINFO_LOOP_VINFO (res) = loop_vinfo;
1536   STMT_VINFO_RELEVANT (res) = 0;
1537   STMT_VINFO_LIVE_P (res) = false;
1538   STMT_VINFO_VECTYPE (res) = NULL;
1539   STMT_VINFO_VEC_STMT (res) = NULL;
1540   STMT_VINFO_IN_PATTERN_P (res) = false;
1541   STMT_VINFO_RELATED_STMT (res) = NULL;
1542   STMT_VINFO_DATA_REF (res) = NULL;
1543
1544   STMT_VINFO_DR_BASE_ADDRESS (res) = NULL;
1545   STMT_VINFO_DR_OFFSET (res) = NULL;
1546   STMT_VINFO_DR_INIT (res) = NULL;
1547   STMT_VINFO_DR_STEP (res) = NULL;
1548   STMT_VINFO_DR_ALIGNED_TO (res) = NULL;
1549
1550   if (gimple_code (stmt) == GIMPLE_PHI
1551       && is_loop_header_bb_p (gimple_bb (stmt)))
1552     STMT_VINFO_DEF_TYPE (res) = vect_unknown_def_type;
1553   else
1554     STMT_VINFO_DEF_TYPE (res) = vect_loop_def;
1555   STMT_VINFO_SAME_ALIGN_REFS (res) = VEC_alloc (dr_p, heap, 5);
1556   STMT_VINFO_INSIDE_OF_LOOP_COST (res) = 0;
1557   STMT_VINFO_OUTSIDE_OF_LOOP_COST (res) = 0;
1558   STMT_SLP_TYPE (res) = 0;
1559   DR_GROUP_FIRST_DR (res) = NULL;
1560   DR_GROUP_NEXT_DR (res) = NULL;
1561   DR_GROUP_SIZE (res) = 0;
1562   DR_GROUP_STORE_COUNT (res) = 0;
1563   DR_GROUP_GAP (res) = 0;
1564   DR_GROUP_SAME_DR_STMT (res) = NULL;
1565   DR_GROUP_READ_WRITE_DEPENDENCE (res) = false;
1566
1567   return res;
1568 }
1569
1570 /* Create a hash table for stmt_vec_info. */
1571
1572 void
1573 init_stmt_vec_info_vec (void)
1574 {
1575   gcc_assert (!stmt_vec_info_vec);
1576   stmt_vec_info_vec = VEC_alloc (vec_void_p, heap, 50);
1577 }
1578
1579 /* Free hash table for stmt_vec_info. */
1580
1581 void
1582 free_stmt_vec_info_vec (void)
1583 {
1584   gcc_assert (stmt_vec_info_vec);
1585   VEC_free (vec_void_p, heap, stmt_vec_info_vec);
1586 }
1587
1588 /* Free stmt vectorization related info.  */
1589
1590 void
1591 free_stmt_vec_info (gimple stmt)
1592 {
1593   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1594
1595   if (!stmt_info)
1596     return;
1597
1598   VEC_free (dr_p, heap, STMT_VINFO_SAME_ALIGN_REFS (stmt_info));
1599   set_vinfo_for_stmt (stmt, NULL);
1600   free (stmt_info);
1601 }
1602
1603
1604 /* Function bb_in_loop_p
1605
1606    Used as predicate for dfs order traversal of the loop bbs.  */
1607
1608 static bool
1609 bb_in_loop_p (const_basic_block bb, const void *data)
1610 {
1611   const struct loop *const loop = (const struct loop *)data;
1612   if (flow_bb_inside_loop_p (loop, bb))
1613     return true;
1614   return false;
1615 }
1616
1617
1618 /* Function new_loop_vec_info.
1619
1620    Create and initialize a new loop_vec_info struct for LOOP, as well as
1621    stmt_vec_info structs for all the stmts in LOOP.  */
1622
1623 loop_vec_info
1624 new_loop_vec_info (struct loop *loop)
1625 {
1626   loop_vec_info res;
1627   basic_block *bbs;
1628   gimple_stmt_iterator si;
1629   unsigned int i, nbbs;
1630
1631   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
1632   LOOP_VINFO_LOOP (res) = loop;
1633
1634   bbs = get_loop_body (loop);
1635
1636   /* Create/Update stmt_info for all stmts in the loop.  */
1637   for (i = 0; i < loop->num_nodes; i++)
1638     {
1639       basic_block bb = bbs[i];
1640
1641       /* BBs in a nested inner-loop will have been already processed (because
1642          we will have called vect_analyze_loop_form for any nested inner-loop).
1643          Therefore, for stmts in an inner-loop we just want to update the
1644          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
1645          loop_info of the outer-loop we are currently considering to vectorize
1646          (instead of the loop_info of the inner-loop).
1647          For stmts in other BBs we need to create a stmt_info from scratch.  */
1648       if (bb->loop_father != loop)
1649         {
1650           /* Inner-loop bb.  */
1651           gcc_assert (loop->inner && bb->loop_father == loop->inner);
1652           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1653             {
1654               gimple phi = gsi_stmt (si);
1655               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
1656               loop_vec_info inner_loop_vinfo =
1657                 STMT_VINFO_LOOP_VINFO (stmt_info);
1658               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
1659               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
1660             }
1661           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1662            {
1663               gimple stmt = gsi_stmt (si);
1664               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1665               loop_vec_info inner_loop_vinfo =
1666                  STMT_VINFO_LOOP_VINFO (stmt_info);
1667               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
1668               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
1669            }
1670         }
1671       else
1672         {
1673           /* bb in current nest.  */
1674           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1675             {
1676               gimple phi = gsi_stmt (si);
1677               gimple_set_uid (phi, 0);
1678               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res));
1679             }
1680
1681           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1682             {
1683               gimple stmt = gsi_stmt (si);
1684               gimple_set_uid (stmt, 0);
1685               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
1686             }
1687         }
1688     }
1689
1690   /* CHECKME: We want to visit all BBs before their successors (except for
1691      latch blocks, for which this assertion wouldn't hold).  In the simple
1692      case of the loop forms we allow, a dfs order of the BBs would the same
1693      as reversed postorder traversal, so we are safe.  */
1694
1695    free (bbs);
1696    bbs = XCNEWVEC (basic_block, loop->num_nodes);
1697    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1698                               bbs, loop->num_nodes, loop);
1699    gcc_assert (nbbs == loop->num_nodes);
1700
1701   LOOP_VINFO_BBS (res) = bbs;
1702   LOOP_VINFO_NITERS (res) = NULL;
1703   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
1704   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
1705   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
1706   LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
1707   LOOP_VINFO_VECT_FACTOR (res) = 0;
1708   LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
1709   LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
1710   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
1711   LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
1712     VEC_alloc (gimple, heap,
1713                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
1714   LOOP_VINFO_MAY_ALIAS_DDRS (res) =
1715     VEC_alloc (ddr_p, heap,
1716                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1717   LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
1718   LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
1719   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1720
1721   return res;
1722 }
1723
1724
1725 /* Function destroy_loop_vec_info.
1726
1727    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1728    stmts in the loop.  */
1729
1730 void
1731 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1732 {
1733   struct loop *loop;
1734   basic_block *bbs;
1735   int nbbs;
1736   gimple_stmt_iterator si;
1737   int j;
1738   VEC (slp_instance, heap) *slp_instances;
1739   slp_instance instance;
1740
1741   if (!loop_vinfo)
1742     return;
1743
1744   loop = LOOP_VINFO_LOOP (loop_vinfo);
1745
1746   bbs = LOOP_VINFO_BBS (loop_vinfo);
1747   nbbs = loop->num_nodes;
1748
1749   if (!clean_stmts)
1750     {
1751       free (LOOP_VINFO_BBS (loop_vinfo));
1752       free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
1753       free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1754       VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
1755
1756       free (loop_vinfo);
1757       loop->aux = NULL;
1758       return;
1759     }
1760
1761   for (j = 0; j < nbbs; j++)
1762     {
1763       basic_block bb = bbs[j];
1764
1765       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1766         free_stmt_vec_info (gsi_stmt (si));
1767
1768       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1769         {
1770           gimple stmt = gsi_stmt (si);
1771           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1772
1773           if (stmt_info)
1774             {
1775               /* Check if this is a "pattern stmt" (introduced by the
1776                  vectorizer during the pattern recognition pass).  */
1777               bool remove_stmt_p = false;
1778               gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1779               if (orig_stmt)
1780                 {
1781                   stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
1782                   if (orig_stmt_info
1783                       && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
1784                     remove_stmt_p = true;
1785                 }
1786
1787               /* Free stmt_vec_info.  */
1788               free_stmt_vec_info (stmt);
1789
1790               /* Remove dead "pattern stmts".  */
1791               if (remove_stmt_p)
1792                 gsi_remove (&si, true);
1793             }
1794           gsi_next (&si);
1795         }
1796     }
1797
1798   free (LOOP_VINFO_BBS (loop_vinfo));
1799   free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
1800   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1801   VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
1802   VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
1803   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1804   for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
1805     vect_free_slp_instance (instance);
1806
1807   VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
1808   VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
1809
1810   free (loop_vinfo);
1811   loop->aux = NULL;
1812 }
1813
1814
1815 /* Function vect_force_dr_alignment_p.
1816
1817    Returns whether the alignment of a DECL can be forced to be aligned
1818    on ALIGNMENT bit boundary.  */
1819
1820 bool
1821 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
1822 {
1823   if (TREE_CODE (decl) != VAR_DECL)
1824     return false;
1825
1826   if (DECL_EXTERNAL (decl))
1827     return false;
1828
1829   if (TREE_ASM_WRITTEN (decl))
1830     return false;
1831
1832   if (TREE_STATIC (decl))
1833     return (alignment <= MAX_OFILE_ALIGNMENT);
1834   else
1835     return (alignment <= MAX_STACK_ALIGNMENT);
1836 }
1837
1838
1839 /* Function get_vectype_for_scalar_type.
1840
1841    Returns the vector type corresponding to SCALAR_TYPE as supported
1842    by the target.  */
1843
1844 tree
1845 get_vectype_for_scalar_type (tree scalar_type)
1846 {
1847   enum machine_mode inner_mode = TYPE_MODE (scalar_type);
1848   int nbytes = GET_MODE_SIZE (inner_mode);
1849   int nunits;
1850   tree vectype;
1851
1852   if (nbytes == 0 || nbytes >= UNITS_PER_SIMD_WORD (inner_mode))
1853     return NULL_TREE;
1854
1855   /* FORNOW: Only a single vector size per mode (UNITS_PER_SIMD_WORD)
1856      is expected.  */
1857   nunits = UNITS_PER_SIMD_WORD (inner_mode) / nbytes;
1858
1859   vectype = build_vector_type (scalar_type, nunits);
1860   if (vect_print_dump_info (REPORT_DETAILS))
1861     {
1862       fprintf (vect_dump, "get vectype with %d units of type ", nunits);
1863       print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
1864     }
1865
1866   if (!vectype)
1867     return NULL_TREE;
1868
1869   if (vect_print_dump_info (REPORT_DETAILS))
1870     {
1871       fprintf (vect_dump, "vectype: ");
1872       print_generic_expr (vect_dump, vectype, TDF_SLIM);
1873     }
1874
1875   if (!VECTOR_MODE_P (TYPE_MODE (vectype))
1876       && !INTEGRAL_MODE_P (TYPE_MODE (vectype)))
1877     {
1878       if (vect_print_dump_info (REPORT_DETAILS))
1879         fprintf (vect_dump, "mode not supported by target.");
1880       return NULL_TREE;
1881     }
1882
1883   return vectype;
1884 }
1885
1886
1887 /* Function vect_supportable_dr_alignment
1888
1889    Return whether the data reference DR is supported with respect to its
1890    alignment.  */
1891
1892 enum dr_alignment_support
1893 vect_supportable_dr_alignment (struct data_reference *dr)
1894 {
1895   gimple stmt = DR_STMT (dr);
1896   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1897   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1898   enum machine_mode mode = (int) TYPE_MODE (vectype);
1899   struct loop *vect_loop = LOOP_VINFO_LOOP (STMT_VINFO_LOOP_VINFO (stmt_info));
1900   bool nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
1901   bool invariant_in_outerloop = false;
1902
1903   if (aligned_access_p (dr))
1904     return dr_aligned;
1905
1906   if (nested_in_vect_loop)
1907     {
1908       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
1909       invariant_in_outerloop =
1910         (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
1911     }
1912
1913   /* Possibly unaligned access.  */
1914
1915   /* We can choose between using the implicit realignment scheme (generating
1916      a misaligned_move stmt) and the explicit realignment scheme (generating
1917      aligned loads with a REALIGN_LOAD). There are two variants to the explicit
1918      realignment scheme: optimized, and unoptimized.
1919      We can optimize the realignment only if the step between consecutive
1920      vector loads is equal to the vector size.  Since the vector memory
1921      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
1922      is guaranteed that the misalignment amount remains the same throughout the
1923      execution of the vectorized loop.  Therefore, we can create the
1924      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
1925      at the loop preheader.
1926
1927      However, in the case of outer-loop vectorization, when vectorizing a
1928      memory access in the inner-loop nested within the LOOP that is now being
1929      vectorized, while it is guaranteed that the misalignment of the
1930      vectorized memory access will remain the same in different outer-loop
1931      iterations, it is *not* guaranteed that is will remain the same throughout
1932      the execution of the inner-loop.  This is because the inner-loop advances
1933      with the original scalar step (and not in steps of VS).  If the inner-loop
1934      step happens to be a multiple of VS, then the misalignment remains fixed
1935      and we can use the optimized realignment scheme.  For example:
1936
1937       for (i=0; i<N; i++)
1938         for (j=0; j<M; j++)
1939           s += a[i+j];
1940
1941      When vectorizing the i-loop in the above example, the step between
1942      consecutive vector loads is 1, and so the misalignment does not remain
1943      fixed across the execution of the inner-loop, and the realignment cannot
1944      be optimized (as illustrated in the following pseudo vectorized loop):
1945
1946       for (i=0; i<N; i+=4)
1947         for (j=0; j<M; j++){
1948           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
1949                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
1950                          // (assuming that we start from an aligned address).
1951           }
1952
1953      We therefore have to use the unoptimized realignment scheme:
1954
1955       for (i=0; i<N; i+=4)
1956           for (j=k; j<M; j+=4)
1957           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
1958                            // that the misalignment of the initial address is
1959                            // 0).
1960
1961      The loop can then be vectorized as follows:
1962
1963       for (k=0; k<4; k++){
1964         rt = get_realignment_token (&vp[k]);
1965         for (i=0; i<N; i+=4){
1966           v1 = vp[i+k];
1967           for (j=k; j<M; j+=4){
1968             v2 = vp[i+j+VS-1];
1969             va = REALIGN_LOAD <v1,v2,rt>;
1970             vs += va;
1971             v1 = v2;
1972           }
1973         }
1974     } */
1975
1976   if (DR_IS_READ (dr))
1977     {
1978       if (optab_handler (vec_realign_load_optab, mode)->insn_code !=
1979                                                              CODE_FOR_nothing
1980           && (!targetm.vectorize.builtin_mask_for_load
1981               || targetm.vectorize.builtin_mask_for_load ()))
1982         {
1983           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1984           if (nested_in_vect_loop
1985               && (TREE_INT_CST_LOW (DR_STEP (dr))
1986                   != GET_MODE_SIZE (TYPE_MODE (vectype))))
1987             return dr_explicit_realign;
1988           else
1989             return dr_explicit_realign_optimized;
1990         }
1991
1992       if (optab_handler (movmisalign_optab, mode)->insn_code !=
1993                                                              CODE_FOR_nothing)
1994         /* Can't software pipeline the loads, but can at least do them.  */
1995         return dr_unaligned_supported;
1996     }
1997
1998   /* Unsupported.  */
1999   return dr_unaligned_unsupported;
2000 }
2001
2002
2003 /* Function vect_is_simple_use.
2004
2005    Input:
2006    LOOP - the loop that is being vectorized.
2007    OPERAND - operand of a stmt in LOOP.
2008    DEF - the defining stmt in case OPERAND is an SSA_NAME.
2009
2010    Returns whether a stmt with OPERAND can be vectorized.
2011    Supportable operands are constants, loop invariants, and operands that are
2012    defined by the current iteration of the loop. Unsupportable operands are
2013    those that are defined by a previous iteration of the loop (as is the case
2014    in reduction/induction computations).  */
2015
2016 bool
2017 vect_is_simple_use (tree operand, loop_vec_info loop_vinfo, gimple *def_stmt,
2018                     tree *def, enum vect_def_type *dt)
2019 {
2020   basic_block bb;
2021   stmt_vec_info stmt_vinfo;
2022   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2023
2024   *def_stmt = NULL;
2025   *def = NULL_TREE;
2026
2027   if (vect_print_dump_info (REPORT_DETAILS))
2028     {
2029       fprintf (vect_dump, "vect_is_simple_use: operand ");
2030       print_generic_expr (vect_dump, operand, TDF_SLIM);
2031     }
2032
2033   if (TREE_CODE (operand) == INTEGER_CST || TREE_CODE (operand) == REAL_CST)
2034     {
2035       *dt = vect_constant_def;
2036       return true;
2037     }
2038   if (is_gimple_min_invariant (operand))
2039     {
2040       *def = operand;
2041       *dt = vect_invariant_def;
2042       return true;
2043     }
2044
2045   if (TREE_CODE (operand) == PAREN_EXPR)
2046     {
2047       if (vect_print_dump_info (REPORT_DETAILS))
2048         fprintf (vect_dump, "non-associatable copy.");
2049       operand = TREE_OPERAND (operand, 0);
2050     }
2051   if (TREE_CODE (operand) != SSA_NAME)
2052     {
2053       if (vect_print_dump_info (REPORT_DETAILS))
2054         fprintf (vect_dump, "not ssa-name.");
2055       return false;
2056     }
2057
2058   *def_stmt = SSA_NAME_DEF_STMT (operand);
2059   if (*def_stmt == NULL)
2060     {
2061       if (vect_print_dump_info (REPORT_DETAILS))
2062         fprintf (vect_dump, "no def_stmt.");
2063       return false;
2064     }
2065
2066   if (vect_print_dump_info (REPORT_DETAILS))
2067     {
2068       fprintf (vect_dump, "def_stmt: ");
2069       print_gimple_stmt (vect_dump, *def_stmt, 0, TDF_SLIM);
2070     }
2071
2072   /* empty stmt is expected only in case of a function argument.
2073      (Otherwise - we expect a phi_node or a GIMPLE_ASSIGN).  */
2074   if (gimple_nop_p (*def_stmt))
2075     {
2076       *def = operand;
2077       *dt = vect_invariant_def;
2078       return true;
2079     }
2080
2081   bb = gimple_bb (*def_stmt);
2082   if (!flow_bb_inside_loop_p (loop, bb))
2083     *dt = vect_invariant_def;
2084   else
2085     {
2086       stmt_vinfo = vinfo_for_stmt (*def_stmt);
2087       *dt = STMT_VINFO_DEF_TYPE (stmt_vinfo);
2088     }
2089
2090   if (*dt == vect_unknown_def_type)
2091     {
2092       if (vect_print_dump_info (REPORT_DETAILS))
2093         fprintf (vect_dump, "Unsupported pattern.");
2094       return false;
2095     }
2096
2097   if (vect_print_dump_info (REPORT_DETAILS))
2098     fprintf (vect_dump, "type of def: %d.",*dt);
2099
2100   switch (gimple_code (*def_stmt))
2101     {
2102     case GIMPLE_PHI:
2103       *def = gimple_phi_result (*def_stmt);
2104       break;
2105
2106     case GIMPLE_ASSIGN:
2107       *def = gimple_assign_lhs (*def_stmt);
2108       break;
2109
2110     case GIMPLE_CALL:
2111       *def = gimple_call_lhs (*def_stmt);
2112       if (*def != NULL)
2113         break;
2114       /* FALLTHRU */
2115     default:
2116       if (vect_print_dump_info (REPORT_DETAILS))
2117         fprintf (vect_dump, "unsupported defining stmt: ");
2118       return false;
2119     }
2120
2121   return true;
2122 }
2123
2124
2125 /* Function supportable_widening_operation
2126
2127    Check whether an operation represented by the code CODE is a
2128    widening operation that is supported by the target platform in
2129    vector form (i.e., when operating on arguments of type VECTYPE).
2130
2131    Widening operations we currently support are NOP (CONVERT), FLOAT
2132    and WIDEN_MULT.  This function checks if these operations are supported
2133    by the target platform either directly (via vector tree-codes), or via
2134    target builtins.
2135
2136    Output:
2137    - CODE1 and CODE2 are codes of vector operations to be used when
2138    vectorizing the operation, if available.
2139    - DECL1 and DECL2 are decls of target builtin functions to be used
2140    when vectorizing the operation, if available. In this case,
2141    CODE1 and CODE2 are CALL_EXPR.
2142    - MULTI_STEP_CVT determines the number of required intermediate steps in
2143    case of multi-step conversion (like char->short->int - in that case
2144    MULTI_STEP_CVT will be 1).
2145    - INTERM_TYPES contains the intermediate type required to perform the
2146    widening operation (short in the above example).  */
2147
2148 bool
2149 supportable_widening_operation (enum tree_code code, gimple stmt, tree vectype,
2150                                 tree *decl1, tree *decl2,
2151                                 enum tree_code *code1, enum tree_code *code2,
2152                                 int *multi_step_cvt,
2153                                 VEC (tree, heap) **interm_types)
2154 {
2155   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2156   loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
2157   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2158   bool ordered_p;
2159   enum machine_mode vec_mode;
2160   enum insn_code icode1 = 0, icode2 = 0;
2161   optab optab1, optab2;
2162   tree type = gimple_expr_type (stmt);
2163   tree wide_vectype = get_vectype_for_scalar_type (type);
2164   enum tree_code c1, c2;
2165
2166   /* The result of a vectorized widening operation usually requires two vectors
2167      (because the widened results do not fit int one vector). The generated
2168      vector results would normally be expected to be generated in the same
2169      order as in the original scalar computation, i.e. if 8 results are
2170      generated in each vector iteration, they are to be organized as follows:
2171         vect1: [res1,res2,res3,res4], vect2: [res5,res6,res7,res8].
2172
2173      However, in the special case that the result of the widening operation is
2174      used in a reduction computation only, the order doesn't matter (because
2175      when vectorizing a reduction we change the order of the computation).
2176      Some targets can take advantage of this and generate more efficient code.
2177      For example, targets like Altivec, that support widen_mult using a sequence
2178      of {mult_even,mult_odd} generate the following vectors:
2179         vect1: [res1,res3,res5,res7], vect2: [res2,res4,res6,res8].
2180
2181      When vectorizing outer-loops, we execute the inner-loop sequentially
2182      (each vectorized inner-loop iteration contributes to VF outer-loop
2183      iterations in parallel). We therefore don't allow to change the order
2184      of the computation in the inner-loop during outer-loop vectorization.  */
2185
2186    if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_by_reduction
2187        && !nested_in_vect_loop_p (vect_loop, stmt))
2188      ordered_p = false;
2189    else
2190      ordered_p = true;
2191
2192   if (!ordered_p
2193       && code == WIDEN_MULT_EXPR
2194       && targetm.vectorize.builtin_mul_widen_even
2195       && targetm.vectorize.builtin_mul_widen_even (vectype)
2196       && targetm.vectorize.builtin_mul_widen_odd
2197       && targetm.vectorize.builtin_mul_widen_odd (vectype))
2198     {
2199       if (vect_print_dump_info (REPORT_DETAILS))
2200         fprintf (vect_dump, "Unordered widening operation detected.");
2201
2202       *code1 = *code2 = CALL_EXPR;
2203       *decl1 = targetm.vectorize.builtin_mul_widen_even (vectype);
2204       *decl2 = targetm.vectorize.builtin_mul_widen_odd (vectype);
2205       return true;
2206     }
2207
2208   switch (code)
2209     {
2210     case WIDEN_MULT_EXPR:
2211       if (BYTES_BIG_ENDIAN)
2212         {
2213           c1 = VEC_WIDEN_MULT_HI_EXPR;
2214           c2 = VEC_WIDEN_MULT_LO_EXPR;
2215         }
2216       else
2217         {
2218           c2 = VEC_WIDEN_MULT_HI_EXPR;
2219           c1 = VEC_WIDEN_MULT_LO_EXPR;
2220         }
2221       break;
2222
2223     CASE_CONVERT:
2224       if (BYTES_BIG_ENDIAN)
2225         {
2226           c1 = VEC_UNPACK_HI_EXPR;
2227           c2 = VEC_UNPACK_LO_EXPR;
2228         }
2229       else
2230         {
2231           c2 = VEC_UNPACK_HI_EXPR;
2232           c1 = VEC_UNPACK_LO_EXPR;
2233         }
2234       break;
2235
2236     case FLOAT_EXPR:
2237       if (BYTES_BIG_ENDIAN)
2238         {
2239           c1 = VEC_UNPACK_FLOAT_HI_EXPR;
2240           c2 = VEC_UNPACK_FLOAT_LO_EXPR;
2241         }
2242       else
2243         {
2244           c2 = VEC_UNPACK_FLOAT_HI_EXPR;
2245           c1 = VEC_UNPACK_FLOAT_LO_EXPR;
2246         }
2247       break;
2248
2249     case FIX_TRUNC_EXPR:
2250       /* ??? Not yet implemented due to missing VEC_UNPACK_FIX_TRUNC_HI_EXPR/
2251          VEC_UNPACK_FIX_TRUNC_LO_EXPR tree codes and optabs used for
2252          computing the operation.  */
2253       return false;
2254
2255     default:
2256       gcc_unreachable ();
2257     }
2258
2259   if (code == FIX_TRUNC_EXPR)
2260     {
2261       /* The signedness is determined from output operand.  */
2262       optab1 = optab_for_tree_code (c1, type, optab_default);
2263       optab2 = optab_for_tree_code (c2, type, optab_default);
2264     }
2265   else
2266     {
2267       optab1 = optab_for_tree_code (c1, vectype, optab_default);
2268       optab2 = optab_for_tree_code (c2, vectype, optab_default);
2269     }
2270
2271   if (!optab1 || !optab2)
2272     return false;
2273
2274   vec_mode = TYPE_MODE (vectype);
2275   if ((icode1 = optab_handler (optab1, vec_mode)->insn_code) == CODE_FOR_nothing
2276        || (icode2 = optab_handler (optab2, vec_mode)->insn_code)
2277                                                        == CODE_FOR_nothing)
2278     return false;
2279
2280   /* Check if it's a multi-step conversion that can be done using intermediate
2281      types.  */
2282   if (insn_data[icode1].operand[0].mode != TYPE_MODE (wide_vectype)
2283        || insn_data[icode2].operand[0].mode != TYPE_MODE (wide_vectype))
2284     {
2285       int i;
2286       tree prev_type = vectype, intermediate_type;
2287       enum machine_mode intermediate_mode, prev_mode = vec_mode;
2288       optab optab3, optab4;
2289
2290       if (!CONVERT_EXPR_CODE_P (code))
2291         return false;
2292
2293       *code1 = c1;
2294       *code2 = c2;
2295
2296       /* We assume here that there will not be more than MAX_INTERM_CVT_STEPS
2297          intermediate  steps in promotion sequence. We try MAX_INTERM_CVT_STEPS
2298          to get to NARROW_VECTYPE, and fail if we do not.  */
2299       *interm_types = VEC_alloc (tree, heap, MAX_INTERM_CVT_STEPS);
2300       for (i = 0; i < 3; i++)
2301         {
2302           intermediate_mode = insn_data[icode1].operand[0].mode;
2303           intermediate_type = lang_hooks.types.type_for_mode (intermediate_mode,
2304                                                      TYPE_UNSIGNED (prev_type));
2305           optab3 = optab_for_tree_code (c1, intermediate_type, optab_default);
2306           optab4 = optab_for_tree_code (c2, intermediate_type, optab_default);
2307
2308           if (!optab3 || !optab4
2309               || (icode1 = optab1->handlers[(int) prev_mode].insn_code)
2310                                                         == CODE_FOR_nothing
2311               || insn_data[icode1].operand[0].mode != intermediate_mode
2312               || (icode2 = optab2->handlers[(int) prev_mode].insn_code)
2313                                                         == CODE_FOR_nothing
2314               || insn_data[icode2].operand[0].mode != intermediate_mode
2315               || (icode1 = optab3->handlers[(int) intermediate_mode].insn_code)
2316                                                         == CODE_FOR_nothing
2317               || (icode2 = optab4->handlers[(int) intermediate_mode].insn_code)
2318                                                         == CODE_FOR_nothing)
2319             return false;
2320
2321           VEC_quick_push (tree, *interm_types, intermediate_type);
2322           (*multi_step_cvt)++;
2323
2324           if (insn_data[icode1].operand[0].mode == TYPE_MODE (wide_vectype)
2325               && insn_data[icode2].operand[0].mode == TYPE_MODE (wide_vectype))
2326             return true;
2327
2328           prev_type = intermediate_type;
2329           prev_mode = intermediate_mode;
2330         }
2331
2332        return false;
2333     }
2334
2335   *code1 = c1;
2336   *code2 = c2;
2337   return true;
2338 }
2339
2340
2341 /* Function supportable_narrowing_operation
2342
2343    Check whether an operation represented by the code CODE is a
2344    narrowing operation that is supported by the target platform in
2345    vector form (i.e., when operating on arguments of type VECTYPE).
2346
2347    Narrowing operations we currently support are NOP (CONVERT) and
2348    FIX_TRUNC. This function checks if these operations are supported by
2349    the target platform directly via vector tree-codes.
2350
2351    Output:
2352    - CODE1 is the code of a vector operation to be used when
2353    vectorizing the operation, if available.
2354    - MULTI_STEP_CVT determines the number of required intermediate steps in
2355    case of multi-step conversion (like int->short->char - in that case
2356    MULTI_STEP_CVT will be 1).
2357    - INTERM_TYPES contains the intermediate type required to perform the
2358    narrowing operation (short in the above example).   */
2359
2360 bool
2361 supportable_narrowing_operation (enum tree_code code,
2362                                  const_gimple stmt, tree vectype,
2363                                  enum tree_code *code1, int *multi_step_cvt,
2364                                  VEC (tree, heap) **interm_types)
2365 {
2366   enum machine_mode vec_mode;
2367   enum insn_code icode1;
2368   optab optab1, interm_optab;
2369   tree type = gimple_expr_type (stmt);
2370   tree narrow_vectype = get_vectype_for_scalar_type (type);
2371   enum tree_code c1;
2372   tree intermediate_type, prev_type;
2373   int i;
2374
2375   switch (code)
2376     {
2377     CASE_CONVERT:
2378       c1 = VEC_PACK_TRUNC_EXPR;
2379       break;
2380
2381     case FIX_TRUNC_EXPR:
2382       c1 = VEC_PACK_FIX_TRUNC_EXPR;
2383       break;
2384
2385     case FLOAT_EXPR:
2386       /* ??? Not yet implemented due to missing VEC_PACK_FLOAT_EXPR
2387          tree code and optabs used for computing the operation.  */
2388       return false;
2389
2390     default:
2391       gcc_unreachable ();
2392     }
2393
2394   if (code == FIX_TRUNC_EXPR)
2395     /* The signedness is determined from output operand.  */
2396     optab1 = optab_for_tree_code (c1, type, optab_default);
2397   else
2398     optab1 = optab_for_tree_code (c1, vectype, optab_default);
2399
2400   if (!optab1)
2401     return false;
2402
2403   vec_mode = TYPE_MODE (vectype);
2404   if ((icode1 = optab_handler (optab1, vec_mode)->insn_code)
2405        == CODE_FOR_nothing)
2406     return false;
2407
2408   /* Check if it's a multi-step conversion that can be done using intermediate
2409      types.  */
2410   if (insn_data[icode1].operand[0].mode != TYPE_MODE (narrow_vectype))
2411     {
2412       enum machine_mode intermediate_mode, prev_mode = vec_mode;
2413
2414       *code1 = c1;
2415       prev_type = vectype;
2416       /* We assume here that there will not be more than MAX_INTERM_CVT_STEPS
2417          intermediate  steps in promotion sequence. We try MAX_INTERM_CVT_STEPS
2418          to get to NARROW_VECTYPE, and fail if we do not.  */
2419       *interm_types = VEC_alloc (tree, heap, MAX_INTERM_CVT_STEPS);
2420       for (i = 0; i < 3; i++)
2421         {
2422           intermediate_mode = insn_data[icode1].operand[0].mode;
2423           intermediate_type = lang_hooks.types.type_for_mode (intermediate_mode,
2424                                                      TYPE_UNSIGNED (prev_type));
2425           interm_optab = optab_for_tree_code (c1, intermediate_type,
2426                                               optab_default);
2427           if (!interm_optab
2428               || (icode1 = optab1->handlers[(int) prev_mode].insn_code)
2429                                                         == CODE_FOR_nothing
2430               || insn_data[icode1].operand[0].mode != intermediate_mode
2431               || (icode1
2432                   = interm_optab->handlers[(int) intermediate_mode].insn_code)
2433                  == CODE_FOR_nothing)
2434             return false;
2435
2436           VEC_quick_push (tree, *interm_types, intermediate_type);
2437           (*multi_step_cvt)++;
2438
2439           if (insn_data[icode1].operand[0].mode == TYPE_MODE (narrow_vectype))
2440             return true;
2441
2442           prev_type = intermediate_type;
2443           prev_mode = intermediate_mode;
2444         }
2445
2446       return false;
2447     }
2448
2449   *code1 = c1;
2450   return true;
2451 }
2452
2453
2454 /* Function reduction_code_for_scalar_code
2455
2456    Input:
2457    CODE - tree_code of a reduction operations.
2458
2459    Output:
2460    REDUC_CODE - the corresponding tree-code to be used to reduce the
2461       vector of partial results into a single scalar result (which
2462       will also reside in a vector).
2463
2464    Return TRUE if a corresponding REDUC_CODE was found, FALSE otherwise.  */
2465
2466 bool
2467 reduction_code_for_scalar_code (enum tree_code code,
2468                                 enum tree_code *reduc_code)
2469 {
2470   switch (code)
2471   {
2472   case MAX_EXPR:
2473     *reduc_code = REDUC_MAX_EXPR;
2474     return true;
2475
2476   case MIN_EXPR:
2477     *reduc_code = REDUC_MIN_EXPR;
2478     return true;
2479
2480   case PLUS_EXPR:
2481     *reduc_code = REDUC_PLUS_EXPR;
2482     return true;
2483
2484   default:
2485     return false;
2486   }
2487 }
2488
2489 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2490    STMT is printed with a message MSG. */
2491
2492 static void
2493 report_vect_op (gimple stmt, const char *msg)
2494 {
2495   fprintf (vect_dump, "%s", msg);
2496   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
2497 }
2498
2499 /* Function vect_is_simple_reduction
2500
2501    Detect a cross-iteration def-use cycle that represents a simple
2502    reduction computation. We look for the following pattern:
2503
2504    loop_header:
2505      a1 = phi < a0, a2 >
2506      a3 = ...
2507      a2 = operation (a3, a1)
2508
2509    such that:
2510    1. operation is commutative and associative and it is safe to
2511       change the order of the computation.
2512    2. no uses for a2 in the loop (a2 is used out of the loop)
2513    3. no uses of a1 in the loop besides the reduction operation.
2514
2515    Condition 1 is tested here.
2516    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.  */
2517
2518 gimple
2519 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi)
2520 {
2521   struct loop *loop = (gimple_bb (phi))->loop_father;
2522   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2523   edge latch_e = loop_latch_edge (loop);
2524   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2525   gimple def_stmt, def1, def2;
2526   enum tree_code code;
2527   tree op1, op2;
2528   tree type;
2529   int nloop_uses;
2530   tree name;
2531   imm_use_iterator imm_iter;
2532   use_operand_p use_p;
2533
2534   gcc_assert (loop == vect_loop || flow_loop_nested_p (vect_loop, loop));
2535
2536   name = PHI_RESULT (phi);
2537   nloop_uses = 0;
2538   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2539     {
2540       gimple use_stmt = USE_STMT (use_p);
2541       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2542           && vinfo_for_stmt (use_stmt)
2543           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2544         nloop_uses++;
2545       if (nloop_uses > 1)
2546         {
2547           if (vect_print_dump_info (REPORT_DETAILS))
2548             fprintf (vect_dump, "reduction used in loop.");
2549           return NULL;
2550         }
2551     }
2552
2553   if (TREE_CODE (loop_arg) != SSA_NAME)
2554     {
2555       if (vect_print_dump_info (REPORT_DETAILS))
2556         {
2557           fprintf (vect_dump, "reduction: not ssa_name: ");
2558           print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
2559         }
2560       return NULL;
2561     }
2562
2563   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2564   if (!def_stmt)
2565     {
2566       if (vect_print_dump_info (REPORT_DETAILS))
2567         fprintf (vect_dump, "reduction: no def_stmt.");
2568       return NULL;
2569     }
2570
2571   if (!is_gimple_assign (def_stmt))
2572     {
2573       if (vect_print_dump_info (REPORT_DETAILS))
2574         print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
2575       return NULL;
2576     }
2577
2578   name = gimple_assign_lhs (def_stmt);
2579   nloop_uses = 0;
2580   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2581     {
2582       gimple use_stmt = USE_STMT (use_p);
2583       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2584           && vinfo_for_stmt (use_stmt)
2585           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2586         nloop_uses++;
2587       if (nloop_uses > 1)
2588         {
2589           if (vect_print_dump_info (REPORT_DETAILS))
2590             fprintf (vect_dump, "reduction used in loop.");
2591           return NULL;
2592         }
2593     }
2594
2595   code = gimple_assign_rhs_code (def_stmt);
2596
2597   if (!commutative_tree_code (code) || !associative_tree_code (code))
2598     {
2599       if (vect_print_dump_info (REPORT_DETAILS))
2600         report_vect_op (def_stmt, "reduction: not commutative/associative: ");
2601       return NULL;
2602     }
2603
2604   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2605     {
2606       if (vect_print_dump_info (REPORT_DETAILS))
2607         report_vect_op (def_stmt, "reduction: not binary operation: ");
2608       return NULL;
2609     }
2610
2611   op1 = gimple_assign_rhs1 (def_stmt);
2612   op2 = gimple_assign_rhs2 (def_stmt);
2613   if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
2614     {
2615       if (vect_print_dump_info (REPORT_DETAILS))
2616         report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
2617       return NULL;
2618     }
2619
2620   /* Check that it's ok to change the order of the computation.  */
2621   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2622   if (TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op1))
2623       || TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op2)))
2624     {
2625       if (vect_print_dump_info (REPORT_DETAILS))
2626         {
2627           fprintf (vect_dump, "reduction: multiple types: operation type: ");
2628           print_generic_expr (vect_dump, type, TDF_SLIM);
2629           fprintf (vect_dump, ", operands types: ");
2630           print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
2631           fprintf (vect_dump, ",");
2632           print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
2633         }
2634       return NULL;
2635     }
2636
2637   /* Generally, when vectorizing a reduction we change the order of the
2638      computation.  This may change the behavior of the program in some
2639      cases, so we need to check that this is ok.  One exception is when
2640      vectorizing an outer-loop: the inner-loop is executed sequentially,
2641      and therefore vectorizing reductions in the inner-loop during
2642      outer-loop vectorization is safe.  */
2643
2644   /* CHECKME: check for !flag_finite_math_only too?  */
2645   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2646       && !nested_in_vect_loop_p (vect_loop, def_stmt))
2647     {
2648       /* Changing the order of operations changes the semantics.  */
2649       if (vect_print_dump_info (REPORT_DETAILS))
2650         report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
2651       return NULL;
2652     }
2653   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2654            && !nested_in_vect_loop_p (vect_loop, def_stmt))
2655     {
2656       /* Changing the order of operations changes the semantics.  */
2657       if (vect_print_dump_info (REPORT_DETAILS))
2658         report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
2659       return NULL;
2660     }
2661   else if (SAT_FIXED_POINT_TYPE_P (type))
2662     {
2663       /* Changing the order of operations changes the semantics.  */
2664       if (vect_print_dump_info (REPORT_DETAILS))
2665         report_vect_op (def_stmt,
2666                         "reduction: unsafe fixed-point math optimization: ");
2667       return NULL;
2668     }
2669
2670   /* reduction is safe. we're dealing with one of the following:
2671      1) integer arithmetic and no trapv
2672      2) floating point arithmetic, and special flags permit this optimization.
2673    */
2674   def1 = SSA_NAME_DEF_STMT (op1);
2675   def2 = SSA_NAME_DEF_STMT (op2);
2676   if (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2))
2677     {
2678       if (vect_print_dump_info (REPORT_DETAILS))
2679         report_vect_op (def_stmt, "reduction: no defs for operands: ");
2680       return NULL;
2681     }
2682
2683
2684   /* Check that one def is the reduction def, defined by PHI,
2685      the other def is either defined in the loop ("vect_loop_def"),
2686      or it's an induction (defined by a loop-header phi-node).  */
2687
2688   if (def2 == phi
2689       && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2690       && (is_gimple_assign (def1)
2691           || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_induction_def
2692           || (gimple_code (def1) == GIMPLE_PHI
2693               && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_loop_def
2694               && !is_loop_header_bb_p (gimple_bb (def1)))))
2695     {
2696       if (vect_print_dump_info (REPORT_DETAILS))
2697         report_vect_op (def_stmt, "detected reduction:");
2698       return def_stmt;
2699     }
2700   else if (def1 == phi
2701            && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2702            && (is_gimple_assign (def2)
2703                || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_induction_def
2704                || (gimple_code (def2) == GIMPLE_PHI
2705                    && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_loop_def
2706                    && !is_loop_header_bb_p (gimple_bb (def2)))))
2707     {
2708       /* Swap operands (just for simplicity - so that the rest of the code
2709          can assume that the reduction variable is always the last (second)
2710          argument).  */
2711       if (vect_print_dump_info (REPORT_DETAILS))
2712         report_vect_op (def_stmt ,
2713                         "detected reduction: need to swap operands:");
2714       swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2715                           gimple_assign_rhs2_ptr (def_stmt));
2716       return def_stmt;
2717     }
2718   else
2719     {
2720       if (vect_print_dump_info (REPORT_DETAILS))
2721         report_vect_op (def_stmt, "reduction: unknown pattern.");
2722       return NULL;
2723     }
2724 }
2725
2726
2727 /* Function vect_is_simple_iv_evolution.
2728
2729    FORNOW: A simple evolution of an induction variables in the loop is
2730    considered a polynomial evolution with constant step.  */
2731
2732 bool
2733 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
2734                              tree * step)
2735 {
2736   tree init_expr;
2737   tree step_expr;
2738   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
2739
2740   /* When there is no evolution in this loop, the evolution function
2741      is not "simple".  */
2742   if (evolution_part == NULL_TREE)
2743     return false;
2744
2745   /* When the evolution is a polynomial of degree >= 2
2746      the evolution function is not "simple".  */
2747   if (tree_is_chrec (evolution_part))
2748     return false;
2749
2750   step_expr = evolution_part;
2751   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
2752
2753   if (vect_print_dump_info (REPORT_DETAILS))
2754     {
2755       fprintf (vect_dump, "step: ");
2756       print_generic_expr (vect_dump, step_expr, TDF_SLIM);
2757       fprintf (vect_dump, ",  init: ");
2758       print_generic_expr (vect_dump, init_expr, TDF_SLIM);
2759     }
2760
2761   *init = init_expr;
2762   *step = step_expr;
2763
2764   if (TREE_CODE (step_expr) != INTEGER_CST)
2765     {
2766       if (vect_print_dump_info (REPORT_DETAILS))
2767         fprintf (vect_dump, "step unknown.");
2768       return false;
2769     }
2770
2771   return true;
2772 }
2773
2774
2775 /* Function vectorize_loops.
2776
2777    Entry Point to loop vectorization phase.  */
2778
2779 unsigned
2780 vectorize_loops (void)
2781 {
2782   unsigned int i;
2783   unsigned int num_vectorized_loops = 0;
2784   unsigned int vect_loops_num;
2785   loop_iterator li;
2786   struct loop *loop;
2787
2788   vect_loops_num = number_of_loops ();
2789
2790   /* Bail out if there are no loops.  */
2791   if (vect_loops_num <= 1)
2792     return 0;
2793
2794   /* Fix the verbosity level if not defined explicitly by the user.  */
2795   vect_set_dump_settings ();
2796
2797   /* Allocate the bitmap that records which virtual variables that
2798      need to be renamed.  */
2799   vect_memsyms_to_rename = BITMAP_ALLOC (NULL);
2800
2801   init_stmt_vec_info_vec ();
2802
2803   /*  ----------- Analyze loops. -----------  */
2804
2805   /* If some loop was duplicated, it gets bigger number
2806      than all previously defined loops. This fact allows us to run
2807      only over initial loops skipping newly generated ones.  */
2808   FOR_EACH_LOOP (li, loop, 0)
2809     if (optimize_loop_nest_for_speed_p (loop))
2810       {
2811         loop_vec_info loop_vinfo;
2812
2813         vect_loop_location = find_loop_location (loop);
2814         loop_vinfo = vect_analyze_loop (loop);
2815         loop->aux = loop_vinfo;
2816
2817         if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
2818           continue;
2819
2820         vect_transform_loop (loop_vinfo);
2821         num_vectorized_loops++;
2822       }
2823   vect_loop_location = UNKNOWN_LOC;
2824
2825   statistics_counter_event (cfun, "Vectorized loops", num_vectorized_loops);
2826   if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)
2827       || (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
2828           && num_vectorized_loops > 0))
2829     fprintf (vect_dump, "vectorized %u loops in function.\n",
2830              num_vectorized_loops);
2831
2832   /*  ----------- Finalize. -----------  */
2833
2834   BITMAP_FREE (vect_memsyms_to_rename);
2835
2836   for (i = 1; i < vect_loops_num; i++)
2837     {
2838       loop_vec_info loop_vinfo;
2839
2840       loop = get_loop (i);
2841       if (!loop)
2842         continue;
2843       loop_vinfo = (loop_vec_info) loop->aux;
2844       destroy_loop_vec_info (loop_vinfo, true);
2845       loop->aux = NULL;
2846     }
2847
2848   free_stmt_vec_info_vec ();
2849
2850   return num_vectorized_loops > 0 ? TODO_cleanup_cfg : 0;
2851 }
2852
2853 /* Increase alignment of global arrays to improve vectorization potential.
2854    TODO:
2855    - Consider also structs that have an array field.
2856    - Use ipa analysis to prune arrays that can't be vectorized?
2857      This should involve global alignment analysis and in the future also
2858      array padding.  */
2859
2860 static unsigned int
2861 increase_alignment (void)
2862 {
2863   struct varpool_node *vnode;
2864
2865   /* Increase the alignment of all global arrays for vectorization.  */
2866   for (vnode = varpool_nodes_queue;
2867        vnode;
2868        vnode = vnode->next_needed)
2869     {
2870       tree vectype, decl = vnode->decl;
2871       unsigned int alignment;
2872
2873       if (TREE_CODE (TREE_TYPE (decl)) != ARRAY_TYPE)
2874         continue;
2875       vectype = get_vectype_for_scalar_type (TREE_TYPE (TREE_TYPE (decl)));
2876       if (!vectype)
2877         continue;
2878       alignment = TYPE_ALIGN (vectype);
2879       if (DECL_ALIGN (decl) >= alignment)
2880         continue;
2881
2882       if (vect_can_force_dr_alignment_p (decl, alignment))
2883         {
2884           DECL_ALIGN (decl) = TYPE_ALIGN (vectype);
2885           DECL_USER_ALIGN (decl) = 1;
2886           if (dump_file)
2887             {
2888               fprintf (dump_file, "Increasing alignment of decl: ");
2889               print_generic_expr (dump_file, decl, TDF_SLIM);
2890             }
2891         }
2892     }
2893   return 0;
2894 }
2895
2896 static bool
2897 gate_increase_alignment (void)
2898 {
2899   return flag_section_anchors && flag_tree_vectorize;
2900 }
2901
2902 struct simple_ipa_opt_pass pass_ipa_increase_alignment =
2903 {
2904  {
2905   SIMPLE_IPA_PASS,
2906   "increase_alignment",                 /* name */
2907   gate_increase_alignment,              /* gate */
2908   increase_alignment,                   /* execute */
2909   NULL,                                 /* sub */
2910   NULL,                                 /* next */
2911   0,                                    /* static_pass_number */
2912   0,                                    /* tv_id */
2913   0,                                    /* properties_required */
2914   0,                                    /* properties_provided */
2915   0,                                    /* properties_destroyed */
2916   0,                                    /* todo_flags_start */
2917   0                                     /* todo_flags_finish */
2918  }
2919 };