Check in tree-dce enh to trunk
[official-gcc.git] / gcc / tree-vectorizer.c
blob79a7461a1a9aee75d9deba700ab1ae5bf40c2ca1
1 /* Loop Vectorization
2 Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
10 version.
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 /* Loop Vectorization Pass.
23 This pass tries to vectorize loops. This first implementation focuses on
24 simple inner-most loops, with no conditional control flow, and a set of
25 simple operations which vector form can be expressed using existing
26 tree codes (PLUS, MULT etc).
28 For example, the vectorizer transforms the following simple loop:
30 short a[N]; short b[N]; short c[N]; int i;
32 for (i=0; i<N; i++){
33 a[i] = b[i] + c[i];
36 as if it was manually vectorized by rewriting the source code into:
38 typedef int __attribute__((mode(V8HI))) v8hi;
39 short a[N]; short b[N]; short c[N]; int i;
40 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
41 v8hi va, vb, vc;
43 for (i=0; i<N/8; i++){
44 vb = pb[i];
45 vc = pc[i];
46 va = vb + vc;
47 pa[i] = va;
50 The main entry to this pass is vectorize_loops(), in which
51 the vectorizer applies a set of analyses on a given set of loops,
52 followed by the actual vectorization transformation for the loops that
53 had successfully passed the analysis phase.
55 Throughout this pass we make a distinction between two types of
56 data: scalars (which are represented by SSA_NAMES), and memory references
57 ("data-refs"). These two types of data require different handling both
58 during analysis and transformation. The types of data-refs that the
59 vectorizer currently supports are ARRAY_REFS which base is an array DECL
60 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
61 accesses are required to have a simple (consecutive) access pattern.
63 Analysis phase:
64 ===============
65 The driver for the analysis phase is vect_analyze_loop_nest().
66 It applies a set of analyses, some of which rely on the scalar evolution
67 analyzer (scev) developed by Sebastian Pop.
69 During the analysis phase the vectorizer records some information
70 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
71 loop, as well as general information about the loop as a whole, which is
72 recorded in a "loop_vec_info" struct attached to each loop.
74 Transformation phase:
75 =====================
76 The loop transformation phase scans all the stmts in the loop, and
77 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
78 the loop that needs to be vectorized. It insert the vector code sequence
79 just before the scalar stmt S, and records a pointer to the vector code
80 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
81 attached to S). This pointer will be used for the vectorization of following
82 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
83 otherwise, we rely on dead code elimination for removing it.
85 For example, say stmt S1 was vectorized into stmt VS1:
87 VS1: vb = px[i];
88 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
89 S2: a = b;
91 To vectorize stmt S2, the vectorizer first finds the stmt that defines
92 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
93 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
94 resulting sequence would be:
96 VS1: vb = px[i];
97 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
98 VS2: va = vb;
99 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
101 Operands that are not SSA_NAMEs, are data-refs that appear in
102 load/store operations (like 'x[i]' in S1), and are handled differently.
104 Target modeling:
105 =================
106 Currently the only target specific information that is used is the
107 size of the vector (in bytes) - "UNITS_PER_SIMD_WORD". Targets that can
108 support different sizes of vectors, for now will need to specify one value
109 for "UNITS_PER_SIMD_WORD". More flexibility will be added in the future.
111 Since we only vectorize operations which vector form can be
112 expressed using existing tree codes, to verify that an operation is
113 supported, the vectorizer checks the relevant optab at the relevant
114 machine_mode (e.g, optab_handler (add_optab, V8HImode)->insn_code). If
115 the value found is CODE_FOR_nothing, then there's no target support, and
116 we can't vectorize the stmt.
118 For additional information on this project see:
119 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
122 #include "config.h"
123 #include "system.h"
124 #include "coretypes.h"
125 #include "tm.h"
126 #include "ggc.h"
127 #include "tree.h"
128 #include "target.h"
129 #include "rtl.h"
130 #include "basic-block.h"
131 #include "diagnostic.h"
132 #include "tree-flow.h"
133 #include "tree-dump.h"
134 #include "timevar.h"
135 #include "cfgloop.h"
136 #include "cfglayout.h"
137 #include "expr.h"
138 #include "recog.h"
139 #include "optabs.h"
140 #include "params.h"
141 #include "toplev.h"
142 #include "tree-chrec.h"
143 #include "tree-data-ref.h"
144 #include "tree-scalar-evolution.h"
145 #include "input.h"
146 #include "tree-vectorizer.h"
147 #include "tree-pass.h"
149 /*************************************************************************
150 General Vectorization Utilities
151 *************************************************************************/
153 /* vect_dump will be set to stderr or dump_file if exist. */
154 FILE *vect_dump;
156 /* vect_verbosity_level set to an invalid value
157 to mark that it's uninitialized. */
158 enum verbosity_levels vect_verbosity_level = MAX_VERBOSITY_LEVEL;
160 /* Loop location. */
161 static LOC vect_loop_location;
163 /* Bitmap of virtual variables to be renamed. */
164 bitmap vect_memsyms_to_rename;
166 /*************************************************************************
167 Simple Loop Peeling Utilities
169 Utilities to support loop peeling for vectorization purposes.
170 *************************************************************************/
173 /* Renames the use *OP_P. */
175 static void
176 rename_use_op (use_operand_p op_p)
178 tree new_name;
180 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
181 return;
183 new_name = get_current_def (USE_FROM_PTR (op_p));
185 /* Something defined outside of the loop. */
186 if (!new_name)
187 return;
189 /* An ordinary ssa name defined in the loop. */
191 SET_USE (op_p, new_name);
195 /* Renames the variables in basic block BB. */
197 static void
198 rename_variables_in_bb (basic_block bb)
200 tree phi;
201 block_stmt_iterator bsi;
202 tree stmt;
203 use_operand_p use_p;
204 ssa_op_iter iter;
205 edge e;
206 edge_iterator ei;
207 struct loop *loop = bb->loop_father;
209 for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
211 stmt = bsi_stmt (bsi);
212 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
213 rename_use_op (use_p);
216 FOR_EACH_EDGE (e, ei, bb->succs)
218 if (!flow_bb_inside_loop_p (loop, e->dest))
219 continue;
220 for (phi = phi_nodes (e->dest); phi; phi = PHI_CHAIN (phi))
221 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (phi, e));
226 /* Renames variables in new generated LOOP. */
228 void
229 rename_variables_in_loop (struct loop *loop)
231 unsigned i;
232 basic_block *bbs;
234 bbs = get_loop_body (loop);
236 for (i = 0; i < loop->num_nodes; i++)
237 rename_variables_in_bb (bbs[i]);
239 free (bbs);
243 /* Update the PHI nodes of NEW_LOOP.
245 NEW_LOOP is a duplicate of ORIG_LOOP.
246 AFTER indicates whether NEW_LOOP executes before or after ORIG_LOOP:
247 AFTER is true if NEW_LOOP executes after ORIG_LOOP, and false if it
248 executes before it. */
250 static void
251 slpeel_update_phis_for_duplicate_loop (struct loop *orig_loop,
252 struct loop *new_loop, bool after)
254 tree new_ssa_name;
255 tree phi_new, phi_orig;
256 tree def;
257 edge orig_loop_latch = loop_latch_edge (orig_loop);
258 edge orig_entry_e = loop_preheader_edge (orig_loop);
259 edge new_loop_exit_e = single_exit (new_loop);
260 edge new_loop_entry_e = loop_preheader_edge (new_loop);
261 edge entry_arg_e = (after ? orig_loop_latch : orig_entry_e);
264 step 1. For each loop-header-phi:
265 Add the first phi argument for the phi in NEW_LOOP
266 (the one associated with the entry of NEW_LOOP)
268 step 2. For each loop-header-phi:
269 Add the second phi argument for the phi in NEW_LOOP
270 (the one associated with the latch of NEW_LOOP)
272 step 3. Update the phis in the successor block of NEW_LOOP.
274 case 1: NEW_LOOP was placed before ORIG_LOOP:
275 The successor block of NEW_LOOP is the header of ORIG_LOOP.
276 Updating the phis in the successor block can therefore be done
277 along with the scanning of the loop header phis, because the
278 header blocks of ORIG_LOOP and NEW_LOOP have exactly the same
279 phi nodes, organized in the same order.
281 case 2: NEW_LOOP was placed after ORIG_LOOP:
282 The successor block of NEW_LOOP is the original exit block of
283 ORIG_LOOP - the phis to be updated are the loop-closed-ssa phis.
284 We postpone updating these phis to a later stage (when
285 loop guards are added).
289 /* Scan the phis in the headers of the old and new loops
290 (they are organized in exactly the same order). */
292 for (phi_new = phi_nodes (new_loop->header),
293 phi_orig = phi_nodes (orig_loop->header);
294 phi_new && phi_orig;
295 phi_new = PHI_CHAIN (phi_new), phi_orig = PHI_CHAIN (phi_orig))
297 /* step 1. */
298 def = PHI_ARG_DEF_FROM_EDGE (phi_orig, entry_arg_e);
299 add_phi_arg (phi_new, def, new_loop_entry_e);
301 /* step 2. */
302 def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_loop_latch);
303 if (TREE_CODE (def) != SSA_NAME)
304 continue;
306 new_ssa_name = get_current_def (def);
307 if (!new_ssa_name)
309 /* This only happens if there are no definitions
310 inside the loop. use the phi_result in this case. */
311 new_ssa_name = PHI_RESULT (phi_new);
314 /* An ordinary ssa name defined in the loop. */
315 add_phi_arg (phi_new, new_ssa_name, loop_latch_edge (new_loop));
317 /* step 3 (case 1). */
318 if (!after)
320 gcc_assert (new_loop_exit_e == orig_entry_e);
321 SET_PHI_ARG_DEF (phi_orig,
322 new_loop_exit_e->dest_idx,
323 new_ssa_name);
329 /* Update PHI nodes for a guard of the LOOP.
331 Input:
332 - LOOP, GUARD_EDGE: LOOP is a loop for which we added guard code that
333 controls whether LOOP is to be executed. GUARD_EDGE is the edge that
334 originates from the guard-bb, skips LOOP and reaches the (unique) exit
335 bb of LOOP. This loop-exit-bb is an empty bb with one successor.
336 We denote this bb NEW_MERGE_BB because before the guard code was added
337 it had a single predecessor (the LOOP header), and now it became a merge
338 point of two paths - the path that ends with the LOOP exit-edge, and
339 the path that ends with GUARD_EDGE.
340 - NEW_EXIT_BB: New basic block that is added by this function between LOOP
341 and NEW_MERGE_BB. It is used to place loop-closed-ssa-form exit-phis.
343 ===> The CFG before the guard-code was added:
344 LOOP_header_bb:
345 loop_body
346 if (exit_loop) goto update_bb
347 else goto LOOP_header_bb
348 update_bb:
350 ==> The CFG after the guard-code was added:
351 guard_bb:
352 if (LOOP_guard_condition) goto new_merge_bb
353 else goto LOOP_header_bb
354 LOOP_header_bb:
355 loop_body
356 if (exit_loop_condition) goto new_merge_bb
357 else goto LOOP_header_bb
358 new_merge_bb:
359 goto update_bb
360 update_bb:
362 ==> The CFG after this function:
363 guard_bb:
364 if (LOOP_guard_condition) goto new_merge_bb
365 else goto LOOP_header_bb
366 LOOP_header_bb:
367 loop_body
368 if (exit_loop_condition) goto new_exit_bb
369 else goto LOOP_header_bb
370 new_exit_bb:
371 new_merge_bb:
372 goto update_bb
373 update_bb:
375 This function:
376 1. creates and updates the relevant phi nodes to account for the new
377 incoming edge (GUARD_EDGE) into NEW_MERGE_BB. This involves:
378 1.1. Create phi nodes at NEW_MERGE_BB.
379 1.2. Update the phi nodes at the successor of NEW_MERGE_BB (denoted
380 UPDATE_BB). UPDATE_BB was the exit-bb of LOOP before NEW_MERGE_BB
381 2. preserves loop-closed-ssa-form by creating the required phi nodes
382 at the exit of LOOP (i.e, in NEW_EXIT_BB).
384 There are two flavors to this function:
386 slpeel_update_phi_nodes_for_guard1:
387 Here the guard controls whether we enter or skip LOOP, where LOOP is a
388 prolog_loop (loop1 below), and the new phis created in NEW_MERGE_BB are
389 for variables that have phis in the loop header.
391 slpeel_update_phi_nodes_for_guard2:
392 Here the guard controls whether we enter or skip LOOP, where LOOP is an
393 epilog_loop (loop2 below), and the new phis created in NEW_MERGE_BB are
394 for variables that have phis in the loop exit.
396 I.E., the overall structure is:
398 loop1_preheader_bb:
399 guard1 (goto loop1/merge1_bb)
400 loop1
401 loop1_exit_bb:
402 guard2 (goto merge1_bb/merge2_bb)
403 merge1_bb
404 loop2
405 loop2_exit_bb
406 merge2_bb
407 next_bb
409 slpeel_update_phi_nodes_for_guard1 takes care of creating phis in
410 loop1_exit_bb and merge1_bb. These are entry phis (phis for the vars
411 that have phis in loop1->header).
413 slpeel_update_phi_nodes_for_guard2 takes care of creating phis in
414 loop2_exit_bb and merge2_bb. These are exit phis (phis for the vars
415 that have phis in next_bb). It also adds some of these phis to
416 loop1_exit_bb.
418 slpeel_update_phi_nodes_for_guard1 is always called before
419 slpeel_update_phi_nodes_for_guard2. They are both needed in order
420 to create correct data-flow and loop-closed-ssa-form.
422 Generally slpeel_update_phi_nodes_for_guard1 creates phis for variables
423 that change between iterations of a loop (and therefore have a phi-node
424 at the loop entry), whereas slpeel_update_phi_nodes_for_guard2 creates
425 phis for variables that are used out of the loop (and therefore have
426 loop-closed exit phis). Some variables may be both updated between
427 iterations and used after the loop. This is why in loop1_exit_bb we
428 may need both entry_phis (created by slpeel_update_phi_nodes_for_guard1)
429 and exit phis (created by slpeel_update_phi_nodes_for_guard2).
431 - IS_NEW_LOOP: if IS_NEW_LOOP is true, then LOOP is a newly created copy of
432 an original loop. i.e., we have:
434 orig_loop
435 guard_bb (goto LOOP/new_merge)
436 new_loop <-- LOOP
437 new_exit
438 new_merge
439 next_bb
441 If IS_NEW_LOOP is false, then LOOP is an original loop, in which case we
442 have:
444 new_loop
445 guard_bb (goto LOOP/new_merge)
446 orig_loop <-- LOOP
447 new_exit
448 new_merge
449 next_bb
451 The SSA names defined in the original loop have a current
452 reaching definition that records the corresponding new
453 ssa-name used in the new duplicated loop copy.  */
456 /* Function slpeel_update_phi_nodes_for_guard1
458 Input:
459 - GUARD_EDGE, LOOP, IS_NEW_LOOP, NEW_EXIT_BB - as explained above.
460 - DEFS - a bitmap of ssa names to mark new names for which we recorded
461 information.
463 In the context of the overall structure, we have:
465 loop1_preheader_bb:
466 guard1 (goto loop1/merg1_bb)
467 LOOP-> loop1
468 loop1_exit_bb:
469 guard2 (goto merge1_bb/merge2_bb)
470 merge1_bb
471 loop2
472 loop2_exit_bb
473 merge2_bb
474 next_bb
476 For each name updated between loop iterations (i.e - for each name that has
477 an entry (loop-header) phi in LOOP) we create a new phi in:
478 1. merge1_bb (to account for the edge from guard1)
479 2. loop1_exit_bb (an exit-phi to keep LOOP in loop-closed form)
482 static void
483 slpeel_update_phi_nodes_for_guard1 (edge guard_edge, struct loop *loop,
484 bool is_new_loop, basic_block *new_exit_bb,
485 bitmap *defs)
487 tree orig_phi, new_phi;
488 tree update_phi, update_phi2;
489 tree guard_arg, loop_arg;
490 basic_block new_merge_bb = guard_edge->dest;
491 edge e = EDGE_SUCC (new_merge_bb, 0);
492 basic_block update_bb = e->dest;
493 basic_block orig_bb = loop->header;
494 edge new_exit_e;
495 tree current_new_name;
496 tree name;
498 /* Create new bb between loop and new_merge_bb. */
499 *new_exit_bb = split_edge (single_exit (loop));
501 new_exit_e = EDGE_SUCC (*new_exit_bb, 0);
503 for (orig_phi = phi_nodes (orig_bb), update_phi = phi_nodes (update_bb);
504 orig_phi && update_phi;
505 orig_phi = PHI_CHAIN (orig_phi), update_phi = PHI_CHAIN (update_phi))
507 /* Virtual phi; Mark it for renaming. We actually want to call
508 mar_sym_for_renaming, but since all ssa renaming datastructures
509 are going to be freed before we get to call ssa_upate, we just
510 record this name for now in a bitmap, and will mark it for
511 renaming later. */
512 name = PHI_RESULT (orig_phi);
513 if (!is_gimple_reg (SSA_NAME_VAR (name)))
514 bitmap_set_bit (vect_memsyms_to_rename, DECL_UID (SSA_NAME_VAR (name)));
516 /** 1. Handle new-merge-point phis **/
518 /* 1.1. Generate new phi node in NEW_MERGE_BB: */
519 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
520 new_merge_bb);
522 /* 1.2. NEW_MERGE_BB has two incoming edges: GUARD_EDGE and the exit-edge
523 of LOOP. Set the two phi args in NEW_PHI for these edges: */
524 loop_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, EDGE_SUCC (loop->latch, 0));
525 guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, loop_preheader_edge (loop));
527 add_phi_arg (new_phi, loop_arg, new_exit_e);
528 add_phi_arg (new_phi, guard_arg, guard_edge);
530 /* 1.3. Update phi in successor block. */
531 gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi, e) == loop_arg
532 || PHI_ARG_DEF_FROM_EDGE (update_phi, e) == guard_arg);
533 SET_PHI_ARG_DEF (update_phi, e->dest_idx, PHI_RESULT (new_phi));
534 update_phi2 = new_phi;
537 /** 2. Handle loop-closed-ssa-form phis **/
539 if (!is_gimple_reg (PHI_RESULT (orig_phi)))
540 continue;
542 /* 2.1. Generate new phi node in NEW_EXIT_BB: */
543 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
544 *new_exit_bb);
546 /* 2.2. NEW_EXIT_BB has one incoming edge: the exit-edge of the loop. */
547 add_phi_arg (new_phi, loop_arg, single_exit (loop));
549 /* 2.3. Update phi in successor of NEW_EXIT_BB: */
550 gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, new_exit_e) == loop_arg);
551 SET_PHI_ARG_DEF (update_phi2, new_exit_e->dest_idx, PHI_RESULT (new_phi));
553 /* 2.4. Record the newly created name with set_current_def.
554 We want to find a name such that
555 name = get_current_def (orig_loop_name)
556 and to set its current definition as follows:
557 set_current_def (name, new_phi_name)
559 If LOOP is a new loop then loop_arg is already the name we're
560 looking for. If LOOP is the original loop, then loop_arg is
561 the orig_loop_name and the relevant name is recorded in its
562 current reaching definition. */
563 if (is_new_loop)
564 current_new_name = loop_arg;
565 else
567 current_new_name = get_current_def (loop_arg);
568 /* current_def is not available only if the variable does not
569 change inside the loop, in which case we also don't care
570 about recording a current_def for it because we won't be
571 trying to create loop-exit-phis for it. */
572 if (!current_new_name)
573 continue;
575 gcc_assert (get_current_def (current_new_name) == NULL_TREE);
577 set_current_def (current_new_name, PHI_RESULT (new_phi));
578 bitmap_set_bit (*defs, SSA_NAME_VERSION (current_new_name));
581 set_phi_nodes (new_merge_bb, phi_reverse (phi_nodes (new_merge_bb)));
585 /* Function slpeel_update_phi_nodes_for_guard2
587 Input:
588 - GUARD_EDGE, LOOP, IS_NEW_LOOP, NEW_EXIT_BB - as explained above.
590 In the context of the overall structure, we have:
592 loop1_preheader_bb:
593 guard1 (goto loop1/merg1_bb)
594 loop1
595 loop1_exit_bb:
596 guard2 (goto merge1_bb/merge2_bb)
597 merge1_bb
598 LOOP-> loop2
599 loop2_exit_bb
600 merge2_bb
601 next_bb
603 For each name used out side the loop (i.e - for each name that has an exit
604 phi in next_bb) we create a new phi in:
605 1. merge2_bb (to account for the edge from guard_bb)
606 2. loop2_exit_bb (an exit-phi to keep LOOP in loop-closed form)
607 3. guard2 bb (an exit phi to keep the preceding loop in loop-closed form),
608 if needed (if it wasn't handled by slpeel_update_phis_nodes_for_phi1).
611 static void
612 slpeel_update_phi_nodes_for_guard2 (edge guard_edge, struct loop *loop,
613 bool is_new_loop, basic_block *new_exit_bb)
615 tree orig_phi, new_phi;
616 tree update_phi, update_phi2;
617 tree guard_arg, loop_arg;
618 basic_block new_merge_bb = guard_edge->dest;
619 edge e = EDGE_SUCC (new_merge_bb, 0);
620 basic_block update_bb = e->dest;
621 edge new_exit_e;
622 tree orig_def, orig_def_new_name;
623 tree new_name, new_name2;
624 tree arg;
626 /* Create new bb between loop and new_merge_bb. */
627 *new_exit_bb = split_edge (single_exit (loop));
629 new_exit_e = EDGE_SUCC (*new_exit_bb, 0);
631 for (update_phi = phi_nodes (update_bb); update_phi;
632 update_phi = PHI_CHAIN (update_phi))
634 orig_phi = update_phi;
635 orig_def = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
636 /* This loop-closed-phi actually doesn't represent a use
637 out of the loop - the phi arg is a constant. */
638 if (TREE_CODE (orig_def) != SSA_NAME)
639 continue;
640 orig_def_new_name = get_current_def (orig_def);
641 arg = NULL_TREE;
643 /** 1. Handle new-merge-point phis **/
645 /* 1.1. Generate new phi node in NEW_MERGE_BB: */
646 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
647 new_merge_bb);
649 /* 1.2. NEW_MERGE_BB has two incoming edges: GUARD_EDGE and the exit-edge
650 of LOOP. Set the two PHI args in NEW_PHI for these edges: */
651 new_name = orig_def;
652 new_name2 = NULL_TREE;
653 if (orig_def_new_name)
655 new_name = orig_def_new_name;
656 /* Some variables have both loop-entry-phis and loop-exit-phis.
657 Such variables were given yet newer names by phis placed in
658 guard_bb by slpeel_update_phi_nodes_for_guard1. I.e:
659 new_name2 = get_current_def (get_current_def (orig_name)). */
660 new_name2 = get_current_def (new_name);
663 if (is_new_loop)
665 guard_arg = orig_def;
666 loop_arg = new_name;
668 else
670 guard_arg = new_name;
671 loop_arg = orig_def;
673 if (new_name2)
674 guard_arg = new_name2;
676 add_phi_arg (new_phi, loop_arg, new_exit_e);
677 add_phi_arg (new_phi, guard_arg, guard_edge);
679 /* 1.3. Update phi in successor block. */
680 gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi, e) == orig_def);
681 SET_PHI_ARG_DEF (update_phi, e->dest_idx, PHI_RESULT (new_phi));
682 update_phi2 = new_phi;
685 /** 2. Handle loop-closed-ssa-form phis **/
687 /* 2.1. Generate new phi node in NEW_EXIT_BB: */
688 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
689 *new_exit_bb);
691 /* 2.2. NEW_EXIT_BB has one incoming edge: the exit-edge of the loop. */
692 add_phi_arg (new_phi, loop_arg, single_exit (loop));
694 /* 2.3. Update phi in successor of NEW_EXIT_BB: */
695 gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, new_exit_e) == loop_arg);
696 SET_PHI_ARG_DEF (update_phi2, new_exit_e->dest_idx, PHI_RESULT (new_phi));
699 /** 3. Handle loop-closed-ssa-form phis for first loop **/
701 /* 3.1. Find the relevant names that need an exit-phi in
702 GUARD_BB, i.e. names for which
703 slpeel_update_phi_nodes_for_guard1 had not already created a
704 phi node. This is the case for names that are used outside
705 the loop (and therefore need an exit phi) but are not updated
706 across loop iterations (and therefore don't have a
707 loop-header-phi).
709 slpeel_update_phi_nodes_for_guard1 is responsible for
710 creating loop-exit phis in GUARD_BB for names that have a
711 loop-header-phi. When such a phi is created we also record
712 the new name in its current definition. If this new name
713 exists, then guard_arg was set to this new name (see 1.2
714 above). Therefore, if guard_arg is not this new name, this
715 is an indication that an exit-phi in GUARD_BB was not yet
716 created, so we take care of it here. */
717 if (guard_arg == new_name2)
718 continue;
719 arg = guard_arg;
721 /* 3.2. Generate new phi node in GUARD_BB: */
722 new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)),
723 guard_edge->src);
725 /* 3.3. GUARD_BB has one incoming edge: */
726 gcc_assert (EDGE_COUNT (guard_edge->src->preds) == 1);
727 add_phi_arg (new_phi, arg, EDGE_PRED (guard_edge->src, 0));
729 /* 3.4. Update phi in successor of GUARD_BB: */
730 gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, guard_edge)
731 == guard_arg);
732 SET_PHI_ARG_DEF (update_phi2, guard_edge->dest_idx, PHI_RESULT (new_phi));
735 set_phi_nodes (new_merge_bb, phi_reverse (phi_nodes (new_merge_bb)));
739 /* Make the LOOP iterate NITERS times. This is done by adding a new IV
740 that starts at zero, increases by one and its limit is NITERS.
742 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
744 void
745 slpeel_make_loop_iterate_ntimes (struct loop *loop, tree niters)
747 tree indx_before_incr, indx_after_incr, cond_stmt, cond;
748 tree orig_cond;
749 edge exit_edge = single_exit (loop);
750 block_stmt_iterator loop_cond_bsi;
751 block_stmt_iterator incr_bsi;
752 bool insert_after;
753 tree init = build_int_cst (TREE_TYPE (niters), 0);
754 tree step = build_int_cst (TREE_TYPE (niters), 1);
755 LOC loop_loc;
757 orig_cond = get_loop_exit_condition (loop);
758 gcc_assert (orig_cond);
759 loop_cond_bsi = bsi_for_stmt (orig_cond);
761 standard_iv_increment_position (loop, &incr_bsi, &insert_after);
762 create_iv (init, step, NULL_TREE, loop,
763 &incr_bsi, insert_after, &indx_before_incr, &indx_after_incr);
765 if (exit_edge->flags & EDGE_TRUE_VALUE) /* 'then' edge exits the loop. */
766 cond = build2 (GE_EXPR, boolean_type_node, indx_after_incr, niters);
767 else /* 'then' edge loops back. */
768 cond = build2 (LT_EXPR, boolean_type_node, indx_after_incr, niters);
770 cond_stmt = build3 (COND_EXPR, TREE_TYPE (orig_cond), cond,
771 NULL_TREE, NULL_TREE);
772 bsi_insert_before (&loop_cond_bsi, cond_stmt, BSI_SAME_STMT);
774 /* Remove old loop exit test: */
775 bsi_remove (&loop_cond_bsi, true);
777 loop_loc = find_loop_location (loop);
778 if (dump_file && (dump_flags & TDF_DETAILS))
780 if (loop_loc != UNKNOWN_LOC)
781 fprintf (dump_file, "\nloop at %s:%d: ",
782 LOC_FILE (loop_loc), LOC_LINE (loop_loc));
783 print_generic_expr (dump_file, cond_stmt, TDF_SLIM);
786 loop->nb_iterations = niters;
790 /* Given LOOP this function generates a new copy of it and puts it
791 on E which is either the entry or exit of LOOP. */
793 struct loop *
794 slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e)
796 struct loop *new_loop;
797 basic_block *new_bbs, *bbs;
798 bool at_exit;
799 bool was_imm_dom;
800 basic_block exit_dest;
801 tree phi, phi_arg;
802 edge exit, new_exit;
804 at_exit = (e == single_exit (loop));
805 if (!at_exit && e != loop_preheader_edge (loop))
806 return NULL;
808 bbs = get_loop_body (loop);
810 /* Check whether duplication is possible. */
811 if (!can_copy_bbs_p (bbs, loop->num_nodes))
813 free (bbs);
814 return NULL;
817 /* Generate new loop structure. */
818 new_loop = duplicate_loop (loop, loop_outer (loop));
819 if (!new_loop)
821 free (bbs);
822 return NULL;
825 exit_dest = single_exit (loop)->dest;
826 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
827 exit_dest) == loop->header ?
828 true : false);
830 new_bbs = XNEWVEC (basic_block, loop->num_nodes);
832 exit = single_exit (loop);
833 copy_bbs (bbs, loop->num_nodes, new_bbs,
834 &exit, 1, &new_exit, NULL,
835 e->src);
837 /* Duplicating phi args at exit bbs as coming
838 also from exit of duplicated loop. */
839 for (phi = phi_nodes (exit_dest); phi; phi = PHI_CHAIN (phi))
841 phi_arg = PHI_ARG_DEF_FROM_EDGE (phi, single_exit (loop));
842 if (phi_arg)
844 edge new_loop_exit_edge;
846 if (EDGE_SUCC (new_loop->header, 0)->dest == new_loop->latch)
847 new_loop_exit_edge = EDGE_SUCC (new_loop->header, 1);
848 else
849 new_loop_exit_edge = EDGE_SUCC (new_loop->header, 0);
851 add_phi_arg (phi, phi_arg, new_loop_exit_edge);
855 if (at_exit) /* Add the loop copy at exit. */
857 redirect_edge_and_branch_force (e, new_loop->header);
858 PENDING_STMT (e) = NULL;
859 set_immediate_dominator (CDI_DOMINATORS, new_loop->header, e->src);
860 if (was_imm_dom)
861 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_loop->header);
863 else /* Add the copy at entry. */
865 edge new_exit_e;
866 edge entry_e = loop_preheader_edge (loop);
867 basic_block preheader = entry_e->src;
869 if (!flow_bb_inside_loop_p (new_loop,
870 EDGE_SUCC (new_loop->header, 0)->dest))
871 new_exit_e = EDGE_SUCC (new_loop->header, 0);
872 else
873 new_exit_e = EDGE_SUCC (new_loop->header, 1);
875 redirect_edge_and_branch_force (new_exit_e, loop->header);
876 PENDING_STMT (new_exit_e) = NULL;
877 set_immediate_dominator (CDI_DOMINATORS, loop->header,
878 new_exit_e->src);
880 /* We have to add phi args to the loop->header here as coming
881 from new_exit_e edge. */
882 for (phi = phi_nodes (loop->header); phi; phi = PHI_CHAIN (phi))
884 phi_arg = PHI_ARG_DEF_FROM_EDGE (phi, entry_e);
885 if (phi_arg)
886 add_phi_arg (phi, phi_arg, new_exit_e);
889 redirect_edge_and_branch_force (entry_e, new_loop->header);
890 PENDING_STMT (entry_e) = NULL;
891 set_immediate_dominator (CDI_DOMINATORS, new_loop->header, preheader);
894 free (new_bbs);
895 free (bbs);
897 return new_loop;
901 /* Given the condition statement COND, put it as the last statement
902 of GUARD_BB; EXIT_BB is the basic block to skip the loop;
903 Assumes that this is the single exit of the guarded loop.
904 Returns the skip edge. */
906 static edge
907 slpeel_add_loop_guard (basic_block guard_bb, tree cond, basic_block exit_bb,
908 basic_block dom_bb)
910 block_stmt_iterator bsi;
911 edge new_e, enter_e;
912 tree cond_stmt;
913 tree gimplify_stmt_list;
915 enter_e = EDGE_SUCC (guard_bb, 0);
916 enter_e->flags &= ~EDGE_FALLTHRU;
917 enter_e->flags |= EDGE_FALSE_VALUE;
918 bsi = bsi_last (guard_bb);
920 cond =
921 force_gimple_operand (cond, &gimplify_stmt_list, true,
922 NULL_TREE);
923 cond_stmt = build3 (COND_EXPR, void_type_node, cond,
924 NULL_TREE, NULL_TREE);
925 if (gimplify_stmt_list)
926 bsi_insert_after (&bsi, gimplify_stmt_list, BSI_NEW_STMT);
928 bsi = bsi_last (guard_bb);
929 bsi_insert_after (&bsi, cond_stmt, BSI_NEW_STMT);
931 /* Add new edge to connect guard block to the merge/loop-exit block. */
932 new_e = make_edge (guard_bb, exit_bb, EDGE_TRUE_VALUE);
933 set_immediate_dominator (CDI_DOMINATORS, exit_bb, dom_bb);
934 return new_e;
938 /* This function verifies that the following restrictions apply to LOOP:
939 (1) it is innermost
940 (2) it consists of exactly 2 basic blocks - header, and an empty latch.
941 (3) it is single entry, single exit
942 (4) its exit condition is the last stmt in the header
943 (5) E is the entry/exit edge of LOOP.
946 bool
947 slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
949 edge exit_e = single_exit (loop);
950 edge entry_e = loop_preheader_edge (loop);
951 tree orig_cond = get_loop_exit_condition (loop);
952 block_stmt_iterator loop_exit_bsi = bsi_last (exit_e->src);
954 if (need_ssa_update_p ())
955 return false;
957 if (loop->inner
958 /* All loops have an outer scope; the only case loop->outer is NULL is for
959 the function itself. */
960 || !loop_outer (loop)
961 || loop->num_nodes != 2
962 || !empty_block_p (loop->latch)
963 || !single_exit (loop)
964 /* Verify that new loop exit condition can be trivially modified. */
965 || (!orig_cond || orig_cond != bsi_stmt (loop_exit_bsi))
966 || (e != exit_e && e != entry_e))
967 return false;
969 return true;
#ifdef ENABLE_CHECKING
/* Sanity-check the CFG produced by loop peeling: FIRST_LOOP's exit block
   must end in a two-way guard, and SECOND_LOOP's preheader must be
   reached both from that guard and from the block preceding
   FIRST_LOOP.  */

void
slpeel_verify_cfg_after_peeling (struct loop *first_loop,
                                 struct loop *second_loop)
{
  basic_block loop1_exit_bb = single_exit (first_loop)->dest;
  basic_block loop2_entry_bb = loop_preheader_edge (second_loop)->src;
  basic_block loop1_entry_bb = loop_preheader_edge (first_loop)->src;

  /* A guard that controls whether second_loop is executed or skipped is
     placed in first_loop's exit block, so that block has exactly two
     successors: the preheader of second_loop and a bb after
     second_loop.  */
  gcc_assert (EDGE_COUNT (loop1_exit_bb->succs) == 2);

  /* 1. Verify that one of the successors of first_loop's exit is the
     preheader of second_loop.  The preheader is expected to have two
     predecessors: first_loop's exit and the block preceding first_loop
     (in either order).  */
  gcc_assert (EDGE_COUNT (loop2_entry_bb->preds) == 2
              && ((EDGE_PRED (loop2_entry_bb, 0)->src == loop1_exit_bb
                   && EDGE_PRED (loop2_entry_bb, 1)->src == loop1_entry_bb)
                  || (EDGE_PRED (loop2_entry_bb, 1)->src == loop1_exit_bb
                      && EDGE_PRED (loop2_entry_bb, 0)->src == loop1_entry_bb)));

  /* 2. Verify that the other successor of first_loop's exit is after
     second_loop.  */
  /* TODO */
}
#endif
1006 /* If the run time cost model check determines that vectorization is
1007 not profitable and hence scalar loop should be generated then set
1008 FIRST_NITERS to prologue peeled iterations. This will allow all the
1009 iterations to be executed in the prologue peeled scalar loop. */
/* NOTE(review): FIRST_NITERS is passed by value, so the final
   assignment of the PHI result below rebinds only the local copy and
   is never seen by the caller -- confirm whether this parameter was
   meant to be a 'tree *'.  */
1011 void
1012 set_prologue_iterations (basic_block bb_before_first_loop,
1013 tree first_niters,
1014 struct loop *loop,
1015 unsigned int th)
1017 edge e;
1018 basic_block cond_bb, then_bb;
1019 tree var, prologue_after_cost_adjust_name, stmt;
1020 block_stmt_iterator bsi;
1021 tree newphi;
1022 edge e_true, e_false, e_fallthru;
1023 tree cond_stmt;
1024 tree gimplify_stmt_list;
1025 tree cost_pre_condition = NULL_TREE;
1026 tree scalar_loop_iters =
1027 unshare_expr (LOOP_VINFO_NITERS_UNCHANGED (loop_vec_info_for_loop (loop)));
/* Split the single entry edge twice: COND_BB will hold the run-time
   cost check, THEN_BB the "use the scalar iteration count" path.  */
1029 e = single_pred_edge (bb_before_first_loop);
1030 cond_bb = split_edge(e);
1032 e = single_pred_edge (bb_before_first_loop);
1033 then_bb = split_edge(e);
1034 set_immediate_dominator (CDI_DOMINATORS, then_bb, cond_bb);
/* FALSE path of the new condition bypasses THEN_BB entirely.  */
1036 e_false = make_single_succ_edge (cond_bb, bb_before_first_loop,
1037 EDGE_FALSE_VALUE);
1038 set_immediate_dominator (CDI_DOMINATORS, bb_before_first_loop, cond_bb);
1040 e_true = EDGE_PRED (then_bb, 0);
1041 e_true->flags &= ~EDGE_FALLTHRU;
1042 e_true->flags |= EDGE_TRUE_VALUE;
1044 e_fallthru = EDGE_SUCC (then_bb, 0);
/* Build and gimplify the check: scalar_loop_iters <= TH, i.e. the
   cost model says vectorization does not pay off.  */
1046 cost_pre_condition =
1047 build2 (LE_EXPR, boolean_type_node, scalar_loop_iters,
1048 build_int_cst (TREE_TYPE (scalar_loop_iters), th));
1049 cost_pre_condition =
1050 force_gimple_operand (cost_pre_condition, &gimplify_stmt_list,
1051 true, NULL_TREE);
1052 cond_stmt = build3 (COND_EXPR, void_type_node, cost_pre_condition,
1053 NULL_TREE, NULL_TREE);
1055 bsi = bsi_last (cond_bb);
1056 if (gimplify_stmt_list)
1057 bsi_insert_after (&bsi, gimplify_stmt_list, BSI_NEW_STMT);
1059 bsi = bsi_last (cond_bb);
1060 bsi_insert_after (&bsi, cond_stmt, BSI_NEW_STMT);
/* On the TRUE path, materialize the full scalar iteration count into a
   fresh temporary so it can feed the PHI below.  */
1062 var = create_tmp_var (TREE_TYPE (scalar_loop_iters),
1063 "prologue_after_cost_adjust");
1064 add_referenced_var (var);
1065 prologue_after_cost_adjust_name =
1066 force_gimple_operand (scalar_loop_iters, &stmt, false, var);
1068 bsi = bsi_last (then_bb);
1069 if (stmt)
1070 bsi_insert_after (&bsi, stmt, BSI_NEW_STMT);
/* Merge the two values: full scalar count when the check fired,
   original FIRST_NITERS otherwise.  */
1072 newphi = create_phi_node (var, bb_before_first_loop);
1073 add_phi_arg (newphi, prologue_after_cost_adjust_name, e_fallthru);
1074 add_phi_arg (newphi, first_niters, e_false);
/* NOTE(review): dead store -- see the note at the top of the
   function.  */
1076 first_niters = PHI_RESULT (newphi);
1080 /* Function slpeel_tree_peel_loop_to_edge.
1082 Peel the first (last) iterations of LOOP into a new prolog (epilog) loop
1083 that is placed on the entry (exit) edge E of LOOP. After this transformation
1084 we have two loops one after the other - first-loop iterates FIRST_NITERS
1085 times, and second-loop iterates the remainder NITERS - FIRST_NITERS times.
1086 If the cost model indicates that it is profitable to emit a scalar
1087 loop instead of the vector one, then the prolog (epilog) loop will iterate
1088 for the entire unchanged scalar iterations of the loop.
1090 Input:
1091 - LOOP: the loop to be peeled.
1092 - E: the exit or entry edge of LOOP.
1093 If it is the entry edge, we peel the first iterations of LOOP. In this
1094 case first-loop is LOOP, and second-loop is the newly created loop.
1095 If it is the exit edge, we peel the last iterations of LOOP. In this
1096 case, first-loop is the newly created loop, and second-loop is LOOP.
1097 - NITERS: the number of iterations that LOOP iterates.
1098 - FIRST_NITERS: the number of iterations that the first-loop should iterate.
1099 - UPDATE_FIRST_LOOP_COUNT: specified whether this function is responsible
1100 for updating the loop bound of the first-loop to FIRST_NITERS. If it
1101 is false, the caller of this function may want to take care of this
1102 (this can be useful if we don't want new stmts added to first-loop).
1103 - TH: cost model profitability threshold of iterations for vectorization.
1104 - CHECK_PROFITABILITY: specify whether cost model check has not occured
1105 during versioning and hence needs to occur during
1106 prologue generation or whether cost model check
1107 has not occured during prologue generation and hence
1108 needs to occur during epilogue generation.
1111 Output:
1112 The function returns a pointer to the new loop-copy, or NULL if it failed
1113 to perform the transformation.
1115 The function generates two if-then-else guards: one before the first loop,
1116 and the other before the second loop:
1117 The first guard is:
1118 if (FIRST_NITERS == 0) then skip the first loop,
1119 and go directly to the second loop.
1120 The second guard is:
1121 if (FIRST_NITERS == NITERS) then skip the second loop.
1123 FORNOW only simple loops are supported (see slpeel_can_duplicate_loop_p).
1124 FORNOW the resulting code will not be in loop-closed-ssa form.
1127 struct loop*
1128 slpeel_tree_peel_loop_to_edge (struct loop *loop,
1129 edge e, tree first_niters,
1130 tree niters, bool update_first_loop_count,
1131 unsigned int th, bool check_profitability)
1133 struct loop *new_loop = NULL, *first_loop, *second_loop;
1134 edge skip_e;
1135 tree pre_condition = NULL_TREE;
1136 bitmap definitions;
1137 basic_block bb_before_second_loop, bb_after_second_loop;
1138 basic_block bb_before_first_loop;
1139 basic_block bb_between_loops;
1140 basic_block new_exit_bb;
1141 edge exit_e = single_exit (loop);
1142 LOC loop_loc;
1143 tree cost_pre_condition = NULL_TREE;
1145 if (!slpeel_can_duplicate_loop_p (loop, e))
1146 return NULL;
1148 /* We have to initialize cfg_hooks. Then, when calling
1149 cfg_hooks->split_edge, the function tree_split_edge
1150 is actually called and, when calling cfg_hooks->duplicate_block,
1151 the function tree_duplicate_bb is called. */
1152 tree_register_cfg_hooks ();
1155 /* 1. Generate a copy of LOOP and put it on E (E is the entry/exit of LOOP).
1156 Resulting CFG would be:
1158 first_loop:
1159 do {
1160 } while ...
1162 second_loop:
1163 do {
1164 } while ...
1166 orig_exit_bb:
1169 if (!(new_loop = slpeel_tree_duplicate_loop_to_edge_cfg (loop, e)))
1171 loop_loc = find_loop_location (loop);
1172 if (dump_file && (dump_flags & TDF_DETAILS))
1174 if (loop_loc != UNKNOWN_LOC)
1175 fprintf (dump_file, "\n%s:%d: note: ",
1176 LOC_FILE (loop_loc), LOC_LINE (loop_loc));
1177 fprintf (dump_file, "tree_duplicate_loop_to_edge_cfg failed.\n");
1179 return NULL;
1182 if (e == exit_e)
1184 /* NEW_LOOP was placed after LOOP. */
1185 first_loop = loop;
1186 second_loop = new_loop;
1188 else
1190 /* NEW_LOOP was placed before LOOP. */
1191 first_loop = new_loop;
1192 second_loop = loop;
/* Record the SSA names that will need replacement, fix up the PHIs of
   the copy, and give the copy's variables fresh names.  */
1195 definitions = ssa_names_to_replace ();
1196 slpeel_update_phis_for_duplicate_loop (loop, new_loop, e == exit_e);
1197 rename_variables_in_loop (new_loop);
1200 /* 2. Add the guard code in one of the following ways:
1202 2.a Add the guard that controls whether the first loop is executed.
1203 This occurs when this function is invoked for prologue or epilogiue
1204 generation and when the cost model check can be done at compile time.
1206 Resulting CFG would be:
1208 bb_before_first_loop:
1209 if (FIRST_NITERS == 0) GOTO bb_before_second_loop
1210 GOTO first-loop
1212 first_loop:
1213 do {
1214 } while ...
1216 bb_before_second_loop:
1218 second_loop:
1219 do {
1220 } while ...
1222 orig_exit_bb:
1224 2.b Add the cost model check that allows the prologue
1225 to iterate for the entire unchanged scalar
1226 iterations of the loop in the event that the cost
1227 model indicates that the scalar loop is more
1228 profitable than the vector one. This occurs when
1229 this function is invoked for prologue generation
1230 and the cost model check needs to be done at run
1231 time.
1233 Resulting CFG after prologue peeling would be:
1235 if (scalar_loop_iterations <= th)
1236 FIRST_NITERS = scalar_loop_iterations
1238 bb_before_first_loop:
1239 if (FIRST_NITERS == 0) GOTO bb_before_second_loop
1240 GOTO first-loop
1242 first_loop:
1243 do {
1244 } while ...
1246 bb_before_second_loop:
1248 second_loop:
1249 do {
1250 } while ...
1252 orig_exit_bb:
1254 2.c Add the cost model check that allows the epilogue
1255 to iterate for the entire unchanged scalar
1256 iterations of the loop in the event that the cost
1257 model indicates that the scalar loop is more
1258 profitable than the vector one. This occurs when
1259 this function is invoked for epilogue generation
1260 and the cost model check needs to be done at run
1261 time.
1263 Resulting CFG after prologue peeling would be:
1265 bb_before_first_loop:
1266 if ((scalar_loop_iterations <= th)
1268 FIRST_NITERS == 0) GOTO bb_before_second_loop
1269 GOTO first-loop
1271 first_loop:
1272 do {
1273 } while ...
1275 bb_before_second_loop:
1277 second_loop:
1278 do {
1279 } while ...
1281 orig_exit_bb:
1284 bb_before_first_loop = split_edge (loop_preheader_edge (first_loop));
1285 bb_before_second_loop = split_edge (single_exit (first_loop));
1287 /* Epilogue peeling. */
1288 if (!update_first_loop_count)
1290 pre_condition =
1291 fold_build2 (LE_EXPR, boolean_type_node, first_niters,
1292 build_int_cst (TREE_TYPE (first_niters), 0));
1293 if (check_profitability)
1295 tree scalar_loop_iters
1296 = unshare_expr (LOOP_VINFO_NITERS_UNCHANGED
1297 (loop_vec_info_for_loop (loop)));
1298 cost_pre_condition =
1299 build2 (LE_EXPR, boolean_type_node, scalar_loop_iters,
1300 build_int_cst (TREE_TYPE (scalar_loop_iters), th));
/* Case 2.c above: fold the run-time cost check into the first
   guard condition.  */
1302 pre_condition = fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
1303 cost_pre_condition, pre_condition);
1307 /* Prologue peeling. */
1308 else
1310 if (check_profitability)
/* NOTE(review): set_prologue_iterations takes FIRST_NITERS by
   value, so the PHI result it creates is not reflected in the
   FIRST_NITERS used below -- confirm this is intended.  */
1311 set_prologue_iterations (bb_before_first_loop, first_niters,
1312 loop, th);
1314 pre_condition =
1315 fold_build2 (LE_EXPR, boolean_type_node, first_niters,
1316 build_int_cst (TREE_TYPE (first_niters), 0));
1319 skip_e = slpeel_add_loop_guard (bb_before_first_loop, pre_condition,
1320 bb_before_second_loop, bb_before_first_loop);
1321 slpeel_update_phi_nodes_for_guard1 (skip_e, first_loop,
1322 first_loop == new_loop,
1323 &new_exit_bb, &definitions);
1326 /* 3. Add the guard that controls whether the second loop is executed.
1327 Resulting CFG would be:
1329 bb_before_first_loop:
1330 if (FIRST_NITERS == 0) GOTO bb_before_second_loop (skip first loop)
1331 GOTO first-loop
1333 first_loop:
1334 do {
1335 } while ...
1337 bb_between_loops:
1338 if (FIRST_NITERS == NITERS) GOTO bb_after_second_loop (skip second loop)
1339 GOTO bb_before_second_loop
1341 bb_before_second_loop:
1343 second_loop:
1344 do {
1345 } while ...
1347 bb_after_second_loop:
1349 orig_exit_bb:
1352 bb_between_loops = new_exit_bb;
1353 bb_after_second_loop = split_edge (single_exit (second_loop));
1355 pre_condition =
1356 fold_build2 (EQ_EXPR, boolean_type_node, first_niters, niters);
1357 skip_e = slpeel_add_loop_guard (bb_between_loops, pre_condition,
1358 bb_after_second_loop, bb_before_first_loop);
1359 slpeel_update_phi_nodes_for_guard2 (skip_e, second_loop,
1360 second_loop == new_loop, &new_exit_bb);
1362 /* 4. Make first-loop iterate FIRST_NITERS times, if requested.
1364 if (update_first_loop_count)
1365 slpeel_make_loop_iterate_ntimes (first_loop, first_niters);
1367 BITMAP_FREE (definitions);
1368 delete_update_ssa ();
1370 return new_loop;
1373 /* Function vect_get_loop_location.
1375 Extract the location of the loop in the source code.
1376 If the loop is not well formed for vectorization, an estimated
1377 location is calculated.
1378 Return the loop location if succeed and NULL if not. */
1381 find_loop_location (struct loop *loop)
1383 tree node = NULL_TREE;
1384 basic_block bb;
1385 block_stmt_iterator si;
1387 if (!loop)
1388 return UNKNOWN_LOC;
1390 node = get_loop_exit_condition (loop);
1392 if (node && CAN_HAVE_LOCATION_P (node) && EXPR_HAS_LOCATION (node)
1393 && EXPR_FILENAME (node) && EXPR_LINENO (node))
1394 return EXPR_LOC (node);
1396 /* If we got here the loop is probably not "well formed",
1397 try to estimate the loop location */
1399 if (!loop->header)
1400 return UNKNOWN_LOC;
1402 bb = loop->header;
1404 for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
1406 node = bsi_stmt (si);
1407 if (node && CAN_HAVE_LOCATION_P (node) && EXPR_HAS_LOCATION (node))
1408 return EXPR_LOC (node);
1411 return UNKNOWN_LOC;
1415 /*************************************************************************
1416 Vectorization Debug Information.
1417 *************************************************************************/
1419 /* Function vect_set_verbosity_level.
1421 Called from toplev.c upon detection of the
1422 -ftree-vectorizer-verbose=N option. */
1424 void
1425 vect_set_verbosity_level (const char *val)
1427 unsigned int vl;
1429 vl = atoi (val);
1430 if (vl < MAX_VERBOSITY_LEVEL)
1431 vect_verbosity_level = vl;
1432 else
1433 vect_verbosity_level = MAX_VERBOSITY_LEVEL - 1;
1437 /* Function vect_set_dump_settings.
1439 Fix the verbosity level of the vectorizer if the
1440 requested level was not set explicitly using the flag
1441 -ftree-vectorizer-verbose=N.
1442 Decide where to print the debugging information (dump_file/stderr).
1443 If the user defined the verbosity level, but there is no dump file,
1444 print to stderr, otherwise print to the dump file. */
1446 static void
1447 vect_set_dump_settings (void)
1449 vect_dump = dump_file;
1451 /* Check if the verbosity level was defined by the user: */
1452 if (vect_verbosity_level != MAX_VERBOSITY_LEVEL)
1454 /* If there is no dump file, print to stderr. */
1455 if (!dump_file)
1456 vect_dump = stderr;
1457 return;
1460 /* User didn't specify verbosity level: */
1461 if (dump_file && (dump_flags & TDF_DETAILS))
1462 vect_verbosity_level = REPORT_DETAILS;
1463 else if (dump_file && (dump_flags & TDF_STATS))
1464 vect_verbosity_level = REPORT_UNVECTORIZED_LOOPS;
1465 else
1466 vect_verbosity_level = REPORT_NONE;
1468 gcc_assert (dump_file || vect_verbosity_level == REPORT_NONE);
1472 /* Function debug_loop_details.
1474 For vectorization debug dumps. */
1476 bool
1477 vect_print_dump_info (enum verbosity_levels vl)
1479 if (vl > vect_verbosity_level)
1480 return false;
1482 if (!current_function_decl || !vect_dump)
1483 return false;
1485 if (vect_loop_location == UNKNOWN_LOC)
1486 fprintf (vect_dump, "\n%s:%d: note: ",
1487 DECL_SOURCE_FILE (current_function_decl),
1488 DECL_SOURCE_LINE (current_function_decl));
1489 else
1490 fprintf (vect_dump, "\n%s:%d: note: ",
1491 LOC_FILE (vect_loop_location), LOC_LINE (vect_loop_location));
1493 return true;
1497 /*************************************************************************
1498 Vectorization Utilities.
1499 *************************************************************************/
1501 /* Function new_stmt_vec_info.
1503 Create and initialize a new stmt_vec_info struct for STMT. */
1505 stmt_vec_info
1506 new_stmt_vec_info (tree stmt, loop_vec_info loop_vinfo)
1508 stmt_vec_info res;
1509 res = (stmt_vec_info) xcalloc (1, sizeof (struct _stmt_vec_info));
1511 STMT_VINFO_TYPE (res) = undef_vec_info_type;
1512 STMT_VINFO_STMT (res) = stmt;
1513 STMT_VINFO_LOOP_VINFO (res) = loop_vinfo;
1514 STMT_VINFO_RELEVANT (res) = 0;
1515 STMT_VINFO_LIVE_P (res) = false;
1516 STMT_VINFO_VECTYPE (res) = NULL;
1517 STMT_VINFO_VEC_STMT (res) = NULL;
1518 STMT_VINFO_IN_PATTERN_P (res) = false;
1519 STMT_VINFO_RELATED_STMT (res) = NULL;
1520 STMT_VINFO_DATA_REF (res) = NULL;
1522 STMT_VINFO_DR_BASE_ADDRESS (res) = NULL;
1523 STMT_VINFO_DR_OFFSET (res) = NULL;
1524 STMT_VINFO_DR_INIT (res) = NULL;
1525 STMT_VINFO_DR_STEP (res) = NULL;
1526 STMT_VINFO_DR_ALIGNED_TO (res) = NULL;
1528 if (TREE_CODE (stmt) == PHI_NODE && is_loop_header_bb_p (bb_for_stmt (stmt)))
1529 STMT_VINFO_DEF_TYPE (res) = vect_unknown_def_type;
1530 else
1531 STMT_VINFO_DEF_TYPE (res) = vect_loop_def;
1532 STMT_VINFO_SAME_ALIGN_REFS (res) = VEC_alloc (dr_p, heap, 5);
1533 STMT_VINFO_INSIDE_OF_LOOP_COST (res) = 0;
1534 STMT_VINFO_OUTSIDE_OF_LOOP_COST (res) = 0;
1535 STMT_SLP_TYPE (res) = 0;
1536 DR_GROUP_FIRST_DR (res) = NULL_TREE;
1537 DR_GROUP_NEXT_DR (res) = NULL_TREE;
1538 DR_GROUP_SIZE (res) = 0;
1539 DR_GROUP_STORE_COUNT (res) = 0;
1540 DR_GROUP_GAP (res) = 0;
1541 DR_GROUP_SAME_DR_STMT (res) = NULL_TREE;
1542 DR_GROUP_READ_WRITE_DEPENDENCE (res) = false;
1544 return res;
1548 /* Free stmt vectorization related info. */
1550 void
1551 free_stmt_vec_info (tree stmt)
1553 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1555 if (!stmt_info)
1556 return;
1558 VEC_free (dr_p, heap, STMT_VINFO_SAME_ALIGN_REFS (stmt_info));
1559 free (stmt_info);
1560 set_stmt_info (stmt_ann (stmt), NULL);
1564 /* Function bb_in_loop_p
1566 Used as predicate for dfs order traversal of the loop bbs. */
1568 static bool
1569 bb_in_loop_p (const_basic_block bb, const void *data)
1571 const struct loop *const loop = (const struct loop *)data;
1572 if (flow_bb_inside_loop_p (loop, bb))
1573 return true;
1574 return false;
1578 /* Function new_loop_vec_info.
1580 Create and initialize a new loop_vec_info struct for LOOP, as well as
1581 stmt_vec_info structs for all the stmts in LOOP. */
1583 loop_vec_info
1584 new_loop_vec_info (struct loop *loop)
1586 loop_vec_info res;
1587 basic_block *bbs;
1588 block_stmt_iterator si;
1589 unsigned int i, nbbs;
1591 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
1592 LOOP_VINFO_LOOP (res) = loop;
1594 bbs = get_loop_body (loop);
1596 /* Create/Update stmt_info for all stmts in the loop. */
1597 for (i = 0; i < loop->num_nodes; i++)
1599 basic_block bb = bbs[i];
1600 tree phi;
1602 /* BBs in a nested inner-loop will have been already processed (because
1603 we will have called vect_analyze_loop_form for any nested inner-loop).
1604 Therefore, for stmts in an inner-loop we just want to update the
1605 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
1606 loop_info of the outer-loop we are currently considering to vectorize
1607 (instead of the loop_info of the inner-loop).
1608 For stmts in other BBs we need to create a stmt_info from scratch. */
1609 if (bb->loop_father != loop)
1611 /* Inner-loop bb. */
1612 gcc_assert (loop->inner && bb->loop_father == loop->inner);
1613 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
1615 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
1616 loop_vec_info inner_loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1617 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
1618 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
1620 for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
1622 tree stmt = bsi_stmt (si);
1623 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1624 loop_vec_info inner_loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1625 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
1626 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
1629 else
1631 /* bb in current nest. */
1632 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
1634 stmt_ann_t ann = get_stmt_ann (phi);
1635 set_stmt_info (ann, new_stmt_vec_info (phi, res));
1638 for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
1640 tree stmt = bsi_stmt (si);
1641 stmt_ann_t ann = stmt_ann (stmt);
1642 set_stmt_info (ann, new_stmt_vec_info (stmt, res));
1647 /* CHECKME: We want to visit all BBs before their successors (except for
1648 latch blocks, for which this assertion wouldn't hold). In the simple
1649 case of the loop forms we allow, a dfs order of the BBs would the same
1650 as reversed postorder traversal, so we are safe. */
1652 free (bbs);
1653 bbs = XCNEWVEC (basic_block, loop->num_nodes);
1654 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1655 bbs, loop->num_nodes, loop);
1656 gcc_assert (nbbs == loop->num_nodes);
1658 LOOP_VINFO_BBS (res) = bbs;
1659 LOOP_VINFO_NITERS (res) = NULL;
1660 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
1661 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
1662 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
1663 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
1664 LOOP_VINFO_VECT_FACTOR (res) = 0;
1665 LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
1666 LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
1667 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
1668 LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
1669 VEC_alloc (tree, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
1670 LOOP_VINFO_MAY_ALIAS_DDRS (res) =
1671 VEC_alloc (ddr_p, heap, PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1672 LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (tree, heap, 10);
1673 LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
1674 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1676 return res;
1680 /* Function destroy_loop_vec_info.
1682 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1683 stmts in the loop. */
/* CLEAN_STMTS false means the per-stmt infos are left alone and only
   the loop-level containers are released.  */
1685 void
1686 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1688 struct loop *loop;
1689 basic_block *bbs;
1690 int nbbs;
1691 block_stmt_iterator si;
1692 int j;
1693 VEC (slp_instance, heap) *slp_instances;
1694 slp_instance instance;
1696 if (!loop_vinfo)
1697 return;
1699 loop = LOOP_VINFO_LOOP (loop_vinfo);
1701 bbs = LOOP_VINFO_BBS (loop_vinfo);
1702 nbbs = loop->num_nodes;
/* Shallow teardown: free loop-level containers only.  */
1704 if (!clean_stmts)
1706 free (LOOP_VINFO_BBS (loop_vinfo));
1707 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
1708 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1709 VEC_free (tree, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
1711 free (loop_vinfo);
1712 loop->aux = NULL;
1713 return;
/* Deep teardown: free every stmt_vec_info, removing stmts that only
   existed as vectorizer-introduced "pattern stmts".  */
1716 for (j = 0; j < nbbs; j++)
1718 basic_block bb = bbs[j];
1719 tree phi;
1721 for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
1722 free_stmt_vec_info (phi);
1724 for (si = bsi_start (bb); !bsi_end_p (si); )
1726 tree stmt = bsi_stmt (si);
1727 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1729 if (stmt_info)
1731 /* Check if this is a "pattern stmt" (introduced by the
1732 vectorizer during the pattern recognition pass). */
1733 bool remove_stmt_p = false;
1734 tree orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1735 if (orig_stmt)
1737 stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
1738 if (orig_stmt_info
1739 && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
1740 remove_stmt_p = true;
1743 /* Free stmt_vec_info. */
1744 free_stmt_vec_info (stmt);
1746 /* Remove dead "pattern stmts". */
1747 if (remove_stmt_p)
1748 bsi_remove (&si, true);
/* NOTE(review): bsi_remove already advances SI; verify the
   subsequent bsi_next does not skip the statement following a
   removed pattern stmt.  */
1750 bsi_next (&si);
1754 free (LOOP_VINFO_BBS (loop_vinfo));
1755 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
1756 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1757 VEC_free (tree, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
1758 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
1759 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1760 for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
1761 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
1762 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
1763 VEC_free (tree, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
1765 free (loop_vinfo);
1766 loop->aux = NULL;
1770 /* Function vect_force_dr_alignment_p.
1772 Returns whether the alignment of a DECL can be forced to be aligned
1773 on ALIGNMENT bit boundary. */
1775 bool
1776 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
1778 if (TREE_CODE (decl) != VAR_DECL)
1779 return false;
1781 if (DECL_EXTERNAL (decl))
1782 return false;
1784 if (TREE_ASM_WRITTEN (decl))
1785 return false;
1787 if (TREE_STATIC (decl))
1788 return (alignment <= MAX_OFILE_ALIGNMENT);
1789 else
1790 /* This used to be PREFERRED_STACK_BOUNDARY, however, that is not 100%
1791 correct until someone implements forced stack alignment. */
1792 return (alignment <= STACK_BOUNDARY);
1796 /* Function get_vectype_for_scalar_type.
1798 Returns the vector type corresponding to SCALAR_TYPE as supported
1799 by the target. */
1801 tree
1802 get_vectype_for_scalar_type (tree scalar_type)
1804 enum machine_mode inner_mode = TYPE_MODE (scalar_type);
1805 int nbytes = GET_MODE_SIZE (inner_mode);
1806 int nunits;
1807 tree vectype;
1809 if (nbytes == 0 || nbytes >= UNITS_PER_SIMD_WORD)
1810 return NULL_TREE;
1812 /* FORNOW: Only a single vector size per target (UNITS_PER_SIMD_WORD)
1813 is expected. */
1814 nunits = UNITS_PER_SIMD_WORD / nbytes;
1816 vectype = build_vector_type (scalar_type, nunits);
1817 if (vect_print_dump_info (REPORT_DETAILS))
1819 fprintf (vect_dump, "get vectype with %d units of type ", nunits);
1820 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
1823 if (!vectype)
1824 return NULL_TREE;
1826 if (vect_print_dump_info (REPORT_DETAILS))
1828 fprintf (vect_dump, "vectype: ");
1829 print_generic_expr (vect_dump, vectype, TDF_SLIM);
1832 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
1833 && !INTEGRAL_MODE_P (TYPE_MODE (vectype)))
1835 if (vect_print_dump_info (REPORT_DETAILS))
1836 fprintf (vect_dump, "mode not supported by target.");
1837 return NULL_TREE;
1840 return vectype;
1844 /* Function vect_supportable_dr_alignment
1846 Return whether the data reference DR is supported with respect to its
1847 alignment. */
1849 enum dr_alignment_support
1850 vect_supportable_dr_alignment (struct data_reference *dr)
1852 tree stmt = DR_STMT (dr);
1853 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1854 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1855 enum machine_mode mode = (int) TYPE_MODE (vectype);
1856 struct loop *vect_loop = LOOP_VINFO_LOOP (STMT_VINFO_LOOP_VINFO (stmt_info));
1857 bool nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
1858 bool invariant_in_outerloop = false;
1860 if (aligned_access_p (dr))
1861 return dr_aligned;
1863 if (nested_in_vect_loop)
1865 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
1866 invariant_in_outerloop =
1867 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
1870 /* Possibly unaligned access. */
1872 /* We can choose between using the implicit realignment scheme (generating
1873 a misaligned_move stmt) and the explicit realignment scheme (generating
1874 aligned loads with a REALIGN_LOAD). There are two variants to the explicit
1875 realignment scheme: optimized, and unoptimized.
1876 We can optimize the realignment only if the step between consecutive
1877 vector loads is equal to the vector size. Since the vector memory
1878 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
1879 is guaranteed that the misalignment amount remains the same throughout the
1880 execution of the vectorized loop. Therefore, we can create the
1881 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
1882 at the loop preheader.
1884 However, in the case of outer-loop vectorization, when vectorizing a
1885 memory access in the inner-loop nested within the LOOP that is now being
1886 vectorized, while it is guaranteed that the misalignment of the
1887 vectorized memory access will remain the same in different outer-loop
1888 iterations, it is *not* guaranteed that is will remain the same throughout
1889 the execution of the inner-loop. This is because the inner-loop advances
1890 with the original scalar step (and not in steps of VS). If the inner-loop
1891 step happens to be a multiple of VS, then the misalignment remains fixed
1892 and we can use the optimized realignment scheme. For example:
1894 for (i=0; i<N; i++)
1895 for (j=0; j<M; j++)
1896 s += a[i+j];
1898 When vectorizing the i-loop in the above example, the step between
1899 consecutive vector loads is 1, and so the misalignment does not remain
1900 fixed across the execution of the inner-loop, and the realignment cannot
1901 be optimized (as illustrated in the following pseudo vectorized loop):
1903 for (i=0; i<N; i+=4)
1904 for (j=0; j<M; j++){
1905 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
1906 // when j is {0,1,2,3,4,5,6,7,...} respectively.
1907 // (assuming that we start from an aligned address).
1910 We therefore have to use the unoptimized realignment scheme:
1912 for (i=0; i<N; i+=4)
1913 for (j=k; j<M; j+=4)
1914 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
1915 // that the misalignment of the initial address is
1916 // 0).
1918 The loop can then be vectorized as follows:
1920 for (k=0; k<4; k++){
1921 rt = get_realignment_token (&vp[k]);
1922 for (i=0; i<N; i+=4){
1923 v1 = vp[i+k];
1924 for (j=k; j<M; j+=4){
1925 v2 = vp[i+j+VS-1];
1926 va = REALIGN_LOAD <v1,v2,rt>;
1927 vs += va;
1928 v1 = v2;
1931 } */
1933 if (DR_IS_READ (dr))
1935 if (optab_handler (vec_realign_load_optab, mode)->insn_code !=
1936 CODE_FOR_nothing
1937 && (!targetm.vectorize.builtin_mask_for_load
1938 || targetm.vectorize.builtin_mask_for_load ()))
1940 if (nested_in_vect_loop
1941 && TREE_INT_CST_LOW (DR_STEP (dr)) != UNITS_PER_SIMD_WORD)
1942 return dr_explicit_realign;
1943 else
1944 return dr_explicit_realign_optimized;
1947 if (optab_handler (movmisalign_optab, mode)->insn_code !=
1948 CODE_FOR_nothing)
1949 /* Can't software pipeline the loads, but can at least do them. */
1950 return dr_unaligned_supported;
1953 /* Unsupported. */
1954 return dr_unaligned_unsupported;
1958 /* Function vect_is_simple_use.
1960 Input:
1961 LOOP - the loop that is being vectorized.
1962 OPERAND - operand of a stmt in LOOP.
1963 DEF - the defining stmt in case OPERAND is an SSA_NAME.
1965 Returns whether a stmt with OPERAND can be vectorized.
1966 Supportable operands are constants, loop invariants, and operands that are
1967 defined by the current iteration of the loop. Unsupportable operands are
1968 those that are defined by a previous iteration of the loop (as is the case
1969 in reduction/induction computations). */
1971 bool
1972 vect_is_simple_use (tree operand, loop_vec_info loop_vinfo, tree *def_stmt,
1973 tree *def, enum vect_def_type *dt)
1975 basic_block bb;
1976 stmt_vec_info stmt_vinfo;
1977 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1979 *def_stmt = NULL_TREE;
1980 *def = NULL_TREE;
1982 if (vect_print_dump_info (REPORT_DETAILS))
1984 fprintf (vect_dump, "vect_is_simple_use: operand ");
1985 print_generic_expr (vect_dump, operand, TDF_SLIM);
1988 if (TREE_CODE (operand) == INTEGER_CST || TREE_CODE (operand) == REAL_CST)
1990 *dt = vect_constant_def;
1991 return true;
1993 if (is_gimple_min_invariant (operand))
1995 *def = operand;
1996 *dt = vect_invariant_def;
1997 return true;
2000 if (TREE_CODE (operand) == PAREN_EXPR)
2002 if (vect_print_dump_info (REPORT_DETAILS))
2003 fprintf (vect_dump, "non-associatable copy.");
2004 operand = TREE_OPERAND (operand, 0);
2006 if (TREE_CODE (operand) != SSA_NAME)
2008 if (vect_print_dump_info (REPORT_DETAILS))
2009 fprintf (vect_dump, "not ssa-name.");
2010 return false;
2013 *def_stmt = SSA_NAME_DEF_STMT (operand);
2014 if (*def_stmt == NULL_TREE )
2016 if (vect_print_dump_info (REPORT_DETAILS))
2017 fprintf (vect_dump, "no def_stmt.");
2018 return false;
2021 if (vect_print_dump_info (REPORT_DETAILS))
2023 fprintf (vect_dump, "def_stmt: ");
2024 print_generic_expr (vect_dump, *def_stmt, TDF_SLIM);
2027 /* empty stmt is expected only in case of a function argument.
2028 (Otherwise - we expect a phi_node or a GIMPLE_MODIFY_STMT). */
2029 if (IS_EMPTY_STMT (*def_stmt))
2031 tree arg = TREE_OPERAND (*def_stmt, 0);
2032 if (is_gimple_min_invariant (arg))
2034 *def = operand;
2035 *dt = vect_invariant_def;
2036 return true;
2039 if (vect_print_dump_info (REPORT_DETAILS))
2040 fprintf (vect_dump, "Unexpected empty stmt.");
2041 return false;
2044 bb = bb_for_stmt (*def_stmt);
2045 if (!flow_bb_inside_loop_p (loop, bb))
2046 *dt = vect_invariant_def;
2047 else
2049 stmt_vinfo = vinfo_for_stmt (*def_stmt);
2050 *dt = STMT_VINFO_DEF_TYPE (stmt_vinfo);
2053 if (*dt == vect_unknown_def_type)
2055 if (vect_print_dump_info (REPORT_DETAILS))
2056 fprintf (vect_dump, "Unsupported pattern.");
2057 return false;
2060 if (vect_print_dump_info (REPORT_DETAILS))
2061 fprintf (vect_dump, "type of def: %d.",*dt);
2063 switch (TREE_CODE (*def_stmt))
2065 case PHI_NODE:
2066 *def = PHI_RESULT (*def_stmt);
2067 break;
2069 case GIMPLE_MODIFY_STMT:
2070 *def = GIMPLE_STMT_OPERAND (*def_stmt, 0);
2071 break;
2073 default:
2074 if (vect_print_dump_info (REPORT_DETAILS))
2075 fprintf (vect_dump, "unsupported defining stmt: ");
2076 return false;
2079 return true;
2083 /* Function supportable_widening_operation
2085 Check whether an operation represented by the code CODE is a
2086 widening operation that is supported by the target platform in
2087 vector form (i.e., when operating on arguments of type VECTYPE).
2089 Widening operations we currently support are NOP (CONVERT), FLOAT
2090 and WIDEN_MULT. This function checks if these operations are supported
2091 by the target platform either directly (via vector tree-codes), or via
2092 target builtins.
2094 Output:
2095 - CODE1 and CODE2 are codes of vector operations to be used when
2096 vectorizing the operation, if available.
2097 - DECL1 and DECL2 are decls of target builtin functions to be used
2098 when vectorizing the operation, if available. In this case,
2099 CODE1 and CODE2 are CALL_EXPR. */
2101 bool
2102 supportable_widening_operation (enum tree_code code, tree stmt, tree vectype,
2103 tree *decl1, tree *decl2,
2104 enum tree_code *code1, enum tree_code *code2)
2106 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2107 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
2108 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2109 bool ordered_p;
2110 enum machine_mode vec_mode;
2111 enum insn_code icode1, icode2;
2112 optab optab1, optab2;
2113 tree expr = GIMPLE_STMT_OPERAND (stmt, 1);
2114 tree type = TREE_TYPE (expr);
2115 tree wide_vectype = get_vectype_for_scalar_type (type);
2116 enum tree_code c1, c2;
2118 /* The result of a vectorized widening operation usually requires two vectors
2119 (because the widened results do not fit int one vector). The generated
2120 vector results would normally be expected to be generated in the same
2121 order as in the original scalar computation. i.e. if 8 results are
2122 generated in each vector iteration, they are to be organized as follows:
2123 vect1: [res1,res2,res3,res4], vect2: [res5,res6,res7,res8].
2125 However, in the special case that the result of the widening operation is
2126 used in a reduction computation only, the order doesn't matter (because
2127 when vectorizing a reduction we change the order of the computation).
2128 Some targets can take advantage of this and generate more efficient code.
2129 For example, targets like Altivec, that support widen_mult using a sequence
2130 of {mult_even,mult_odd} generate the following vectors:
2131 vect1: [res1,res3,res5,res7], vect2: [res2,res4,res6,res8].
2133 When vectorizaing outer-loops, we execute the inner-loop sequentially
2134 (each vectorized inner-loop iteration contributes to VF outer-loop
2135 iterations in parallel). We therefore don't allow to change the order
2136 of the computation in the inner-loop during outer-loop vectorization. */
2138 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_by_reduction
2139 && !nested_in_vect_loop_p (vect_loop, stmt))
2140 ordered_p = false;
2141 else
2142 ordered_p = true;
2144 if (!ordered_p
2145 && code == WIDEN_MULT_EXPR
2146 && targetm.vectorize.builtin_mul_widen_even
2147 && targetm.vectorize.builtin_mul_widen_even (vectype)
2148 && targetm.vectorize.builtin_mul_widen_odd
2149 && targetm.vectorize.builtin_mul_widen_odd (vectype))
2151 if (vect_print_dump_info (REPORT_DETAILS))
2152 fprintf (vect_dump, "Unordered widening operation detected.");
2154 *code1 = *code2 = CALL_EXPR;
2155 *decl1 = targetm.vectorize.builtin_mul_widen_even (vectype);
2156 *decl2 = targetm.vectorize.builtin_mul_widen_odd (vectype);
2157 return true;
2160 switch (code)
2162 case WIDEN_MULT_EXPR:
2163 if (BYTES_BIG_ENDIAN)
2165 c1 = VEC_WIDEN_MULT_HI_EXPR;
2166 c2 = VEC_WIDEN_MULT_LO_EXPR;
2168 else
2170 c2 = VEC_WIDEN_MULT_HI_EXPR;
2171 c1 = VEC_WIDEN_MULT_LO_EXPR;
2173 break;
2175 CASE_CONVERT:
2176 if (BYTES_BIG_ENDIAN)
2178 c1 = VEC_UNPACK_HI_EXPR;
2179 c2 = VEC_UNPACK_LO_EXPR;
2181 else
2183 c2 = VEC_UNPACK_HI_EXPR;
2184 c1 = VEC_UNPACK_LO_EXPR;
2186 break;
2188 case FLOAT_EXPR:
2189 if (BYTES_BIG_ENDIAN)
2191 c1 = VEC_UNPACK_FLOAT_HI_EXPR;
2192 c2 = VEC_UNPACK_FLOAT_LO_EXPR;
2194 else
2196 c2 = VEC_UNPACK_FLOAT_HI_EXPR;
2197 c1 = VEC_UNPACK_FLOAT_LO_EXPR;
2199 break;
2201 case FIX_TRUNC_EXPR:
2202 /* ??? Not yet implemented due to missing VEC_UNPACK_FIX_TRUNC_HI_EXPR/
2203 VEC_UNPACK_FIX_TRUNC_LO_EXPR tree codes and optabs used for
2204 computing the operation. */
2205 return false;
2207 default:
2208 gcc_unreachable ();
2211 if (code == FIX_TRUNC_EXPR)
2213 /* The signedness is determined from output operand. */
2214 optab1 = optab_for_tree_code (c1, type, optab_default);
2215 optab2 = optab_for_tree_code (c2, type, optab_default);
2217 else
2219 optab1 = optab_for_tree_code (c1, vectype, optab_default);
2220 optab2 = optab_for_tree_code (c2, vectype, optab_default);
2223 if (!optab1 || !optab2)
2224 return false;
2226 vec_mode = TYPE_MODE (vectype);
2227 if ((icode1 = optab_handler (optab1, vec_mode)->insn_code) == CODE_FOR_nothing
2228 || insn_data[icode1].operand[0].mode != TYPE_MODE (wide_vectype)
2229 || (icode2 = optab_handler (optab2, vec_mode)->insn_code)
2230 == CODE_FOR_nothing
2231 || insn_data[icode2].operand[0].mode != TYPE_MODE (wide_vectype))
2232 return false;
2234 *code1 = c1;
2235 *code2 = c2;
2236 return true;
2240 /* Function supportable_narrowing_operation
2242 Check whether an operation represented by the code CODE is a
2243 narrowing operation that is supported by the target platform in
2244 vector form (i.e., when operating on arguments of type VECTYPE).
2246 Narrowing operations we currently support are NOP (CONVERT) and
2247 FIX_TRUNC. This function checks if these operations are supported by
2248 the target platform directly via vector tree-codes.
2250 Output:
2251 - CODE1 is the code of a vector operation to be used when
2252 vectorizing the operation, if available. */
2254 bool
2255 supportable_narrowing_operation (enum tree_code code,
2256 const_tree stmt, const_tree vectype,
2257 enum tree_code *code1)
2259 enum machine_mode vec_mode;
2260 enum insn_code icode1;
2261 optab optab1;
2262 tree expr = GIMPLE_STMT_OPERAND (stmt, 1);
2263 tree type = TREE_TYPE (expr);
2264 tree narrow_vectype = get_vectype_for_scalar_type (type);
2265 enum tree_code c1;
2267 switch (code)
2269 CASE_CONVERT:
2270 c1 = VEC_PACK_TRUNC_EXPR;
2271 break;
2273 case FIX_TRUNC_EXPR:
2274 c1 = VEC_PACK_FIX_TRUNC_EXPR;
2275 break;
2277 case FLOAT_EXPR:
2278 /* ??? Not yet implemented due to missing VEC_PACK_FLOAT_EXPR
2279 tree code and optabs used for computing the operation. */
2280 return false;
2282 default:
2283 gcc_unreachable ();
2286 if (code == FIX_TRUNC_EXPR)
2287 /* The signedness is determined from output operand. */
2288 optab1 = optab_for_tree_code (c1, type, optab_default);
2289 else
2290 optab1 = optab_for_tree_code (c1, vectype, optab_default);
2292 if (!optab1)
2293 return false;
2295 vec_mode = TYPE_MODE (vectype);
2296 if ((icode1 = optab_handler (optab1, vec_mode)->insn_code) == CODE_FOR_nothing
2297 || insn_data[icode1].operand[0].mode != TYPE_MODE (narrow_vectype))
2298 return false;
2300 *code1 = c1;
2301 return true;
2305 /* Function reduction_code_for_scalar_code
2307 Input:
2308 CODE - tree_code of a reduction operations.
2310 Output:
2311 REDUC_CODE - the corresponding tree-code to be used to reduce the
2312 vector of partial results into a single scalar result (which
2313 will also reside in a vector).
2315 Return TRUE if a corresponding REDUC_CODE was found, FALSE otherwise. */
2317 bool
2318 reduction_code_for_scalar_code (enum tree_code code,
2319 enum tree_code *reduc_code)
2321 switch (code)
2323 case MAX_EXPR:
2324 *reduc_code = REDUC_MAX_EXPR;
2325 return true;
2327 case MIN_EXPR:
2328 *reduc_code = REDUC_MIN_EXPR;
2329 return true;
2331 case PLUS_EXPR:
2332 *reduc_code = REDUC_PLUS_EXPR;
2333 return true;
2335 default:
2336 return false;
2341 /* Function vect_is_simple_reduction
2343 Detect a cross-iteration def-use cycle that represents a simple
2344 reduction computation. We look for the following pattern:
2346 loop_header:
2347 a1 = phi < a0, a2 >
2348 a3 = ...
2349 a2 = operation (a3, a1)
2351 such that:
2352 1. operation is commutative and associative and it is safe to
2353 change the order of the computation.
2354 2. no uses for a2 in the loop (a2 is used out of the loop)
2355 3. no uses of a1 in the loop besides the reduction operation.
2357 Condition 1 is tested here.
2358 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized. */
2360 tree
2361 vect_is_simple_reduction (loop_vec_info loop_info, tree phi)
2363 struct loop *loop = (bb_for_stmt (phi))->loop_father;
2364 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2365 edge latch_e = loop_latch_edge (loop);
2366 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2367 tree def_stmt, def1, def2;
2368 enum tree_code code;
2369 int op_type;
2370 tree operation, op1, op2;
2371 tree type;
2372 int nloop_uses;
2373 tree name;
2374 imm_use_iterator imm_iter;
2375 use_operand_p use_p;
2377 gcc_assert (loop == vect_loop || flow_loop_nested_p (vect_loop, loop));
2379 name = PHI_RESULT (phi);
2380 nloop_uses = 0;
2381 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2383 tree use_stmt = USE_STMT (use_p);
2384 if (flow_bb_inside_loop_p (loop, bb_for_stmt (use_stmt))
2385 && vinfo_for_stmt (use_stmt)
2386 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2387 nloop_uses++;
2388 if (nloop_uses > 1)
2390 if (vect_print_dump_info (REPORT_DETAILS))
2391 fprintf (vect_dump, "reduction used in loop.");
2392 return NULL_TREE;
2396 if (TREE_CODE (loop_arg) != SSA_NAME)
2398 if (vect_print_dump_info (REPORT_DETAILS))
2400 fprintf (vect_dump, "reduction: not ssa_name: ");
2401 print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
2403 return NULL_TREE;
2406 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2407 if (!def_stmt)
2409 if (vect_print_dump_info (REPORT_DETAILS))
2410 fprintf (vect_dump, "reduction: no def_stmt.");
2411 return NULL_TREE;
2414 if (TREE_CODE (def_stmt) != GIMPLE_MODIFY_STMT)
2416 if (vect_print_dump_info (REPORT_DETAILS))
2417 print_generic_expr (vect_dump, def_stmt, TDF_SLIM);
2418 return NULL_TREE;
2421 name = GIMPLE_STMT_OPERAND (def_stmt, 0);
2422 nloop_uses = 0;
2423 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2425 tree use_stmt = USE_STMT (use_p);
2426 if (flow_bb_inside_loop_p (loop, bb_for_stmt (use_stmt))
2427 && vinfo_for_stmt (use_stmt)
2428 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2429 nloop_uses++;
2430 if (nloop_uses > 1)
2432 if (vect_print_dump_info (REPORT_DETAILS))
2433 fprintf (vect_dump, "reduction used in loop.");
2434 return NULL_TREE;
2438 operation = GIMPLE_STMT_OPERAND (def_stmt, 1);
2439 code = TREE_CODE (operation);
2440 if (!commutative_tree_code (code) || !associative_tree_code (code))
2442 if (vect_print_dump_info (REPORT_DETAILS))
2444 fprintf (vect_dump, "reduction: not commutative/associative: ");
2445 print_generic_expr (vect_dump, operation, TDF_SLIM);
2447 return NULL_TREE;
2450 op_type = TREE_OPERAND_LENGTH (operation);
2451 if (op_type != binary_op)
2453 if (vect_print_dump_info (REPORT_DETAILS))
2455 fprintf (vect_dump, "reduction: not binary operation: ");
2456 print_generic_expr (vect_dump, operation, TDF_SLIM);
2458 return NULL_TREE;
2461 op1 = TREE_OPERAND (operation, 0);
2462 op2 = TREE_OPERAND (operation, 1);
2463 if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
2465 if (vect_print_dump_info (REPORT_DETAILS))
2467 fprintf (vect_dump, "reduction: uses not ssa_names: ");
2468 print_generic_expr (vect_dump, operation, TDF_SLIM);
2470 return NULL_TREE;
2473 /* Check that it's ok to change the order of the computation. */
2474 type = TREE_TYPE (operation);
2475 if (TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op1))
2476 || TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op2)))
2478 if (vect_print_dump_info (REPORT_DETAILS))
2480 fprintf (vect_dump, "reduction: multiple types: operation type: ");
2481 print_generic_expr (vect_dump, type, TDF_SLIM);
2482 fprintf (vect_dump, ", operands types: ");
2483 print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
2484 fprintf (vect_dump, ",");
2485 print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
2487 return NULL_TREE;
2490 /* Generally, when vectorizing a reduction we change the order of the
2491 computation. This may change the behavior of the program in some
2492 cases, so we need to check that this is ok. One exception is when
2493 vectorizing an outer-loop: the inner-loop is executed sequentially,
2494 and therefore vectorizing reductions in the inner-loop durint
2495 outer-loop vectorization is safe. */
2497 /* CHECKME: check for !flag_finite_math_only too? */
2498 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2499 && !nested_in_vect_loop_p (vect_loop, def_stmt))
2501 /* Changing the order of operations changes the semantics. */
2502 if (vect_print_dump_info (REPORT_DETAILS))
2504 fprintf (vect_dump, "reduction: unsafe fp math optimization: ");
2505 print_generic_expr (vect_dump, operation, TDF_SLIM);
2507 return NULL_TREE;
2509 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2510 && !nested_in_vect_loop_p (vect_loop, def_stmt))
2512 /* Changing the order of operations changes the semantics. */
2513 if (vect_print_dump_info (REPORT_DETAILS))
2515 fprintf (vect_dump, "reduction: unsafe int math optimization: ");
2516 print_generic_expr (vect_dump, operation, TDF_SLIM);
2518 return NULL_TREE;
2520 else if (SAT_FIXED_POINT_TYPE_P (type))
2522 /* Changing the order of operations changes the semantics. */
2523 if (vect_print_dump_info (REPORT_DETAILS))
2525 fprintf (vect_dump, "reduction: unsafe fixed-point math optimization: ");
2526 print_generic_expr (vect_dump, operation, TDF_SLIM);
2528 return NULL_TREE;
2531 /* reduction is safe. we're dealing with one of the following:
2532 1) integer arithmetic and no trapv
2533 2) floating point arithmetic, and special flags permit this optimization.
2535 def1 = SSA_NAME_DEF_STMT (op1);
2536 def2 = SSA_NAME_DEF_STMT (op2);
2537 if (!def1 || !def2 || IS_EMPTY_STMT (def1) || IS_EMPTY_STMT (def2))
2539 if (vect_print_dump_info (REPORT_DETAILS))
2541 fprintf (vect_dump, "reduction: no defs for operands: ");
2542 print_generic_expr (vect_dump, operation, TDF_SLIM);
2544 return NULL_TREE;
2548 /* Check that one def is the reduction def, defined by PHI,
2549 the other def is either defined in the loop ("vect_loop_def"),
2550 or it's an induction (defined by a loop-header phi-node). */
2552 if (def2 == phi
2553 && flow_bb_inside_loop_p (loop, bb_for_stmt (def1))
2554 && (TREE_CODE (def1) == GIMPLE_MODIFY_STMT
2555 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_induction_def
2556 || (TREE_CODE (def1) == PHI_NODE
2557 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_loop_def
2558 && !is_loop_header_bb_p (bb_for_stmt (def1)))))
2560 if (vect_print_dump_info (REPORT_DETAILS))
2562 fprintf (vect_dump, "detected reduction:");
2563 print_generic_expr (vect_dump, operation, TDF_SLIM);
2565 return def_stmt;
2567 else if (def1 == phi
2568 && flow_bb_inside_loop_p (loop, bb_for_stmt (def2))
2569 && (TREE_CODE (def2) == GIMPLE_MODIFY_STMT
2570 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_induction_def
2571 || (TREE_CODE (def2) == PHI_NODE
2572 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) == vect_loop_def
2573 && !is_loop_header_bb_p (bb_for_stmt (def2)))))
2575 /* Swap operands (just for simplicity - so that the rest of the code
2576 can assume that the reduction variable is always the last (second)
2577 argument). */
2578 if (vect_print_dump_info (REPORT_DETAILS))
2580 fprintf (vect_dump, "detected reduction: need to swap operands:");
2581 print_generic_expr (vect_dump, operation, TDF_SLIM);
2583 swap_tree_operands (def_stmt, &TREE_OPERAND (operation, 0),
2584 &TREE_OPERAND (operation, 1));
2585 return def_stmt;
2587 else
2589 if (vect_print_dump_info (REPORT_DETAILS))
2591 fprintf (vect_dump, "reduction: unknown pattern.");
2592 print_generic_expr (vect_dump, operation, TDF_SLIM);
2594 return NULL_TREE;
2599 /* Function vect_is_simple_iv_evolution.
2601 FORNOW: A simple evolution of an induction variables in the loop is
2602 considered a polynomial evolution with constant step. */
2604 bool
2605 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
2606 tree * step)
2608 tree init_expr;
2609 tree step_expr;
2610 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
2612 /* When there is no evolution in this loop, the evolution function
2613 is not "simple". */
2614 if (evolution_part == NULL_TREE)
2615 return false;
2617 /* When the evolution is a polynomial of degree >= 2
2618 the evolution function is not "simple". */
2619 if (tree_is_chrec (evolution_part))
2620 return false;
2622 step_expr = evolution_part;
2623 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
2625 if (vect_print_dump_info (REPORT_DETAILS))
2627 fprintf (vect_dump, "step: ");
2628 print_generic_expr (vect_dump, step_expr, TDF_SLIM);
2629 fprintf (vect_dump, ", init: ");
2630 print_generic_expr (vect_dump, init_expr, TDF_SLIM);
2633 *init = init_expr;
2634 *step = step_expr;
2636 if (TREE_CODE (step_expr) != INTEGER_CST)
2638 if (vect_print_dump_info (REPORT_DETAILS))
2639 fprintf (vect_dump, "step unknown.");
2640 return false;
2643 return true;
2647 /* Function vectorize_loops.
2649 Entry Point to loop vectorization phase. */
2651 unsigned
2652 vectorize_loops (void)
2654 unsigned int i;
2655 unsigned int num_vectorized_loops = 0;
2656 unsigned int vect_loops_num;
2657 loop_iterator li;
2658 struct loop *loop;
2660 vect_loops_num = number_of_loops ();
2662 /* Bail out if there are no loops. */
2663 if (vect_loops_num <= 1)
2664 return 0;
2666 /* Fix the verbosity level if not defined explicitly by the user. */
2667 vect_set_dump_settings ();
2669 /* Allocate the bitmap that records which virtual variables that
2670 need to be renamed. */
2671 vect_memsyms_to_rename = BITMAP_ALLOC (NULL);
2673 /* ----------- Analyze loops. ----------- */
2675 /* If some loop was duplicated, it gets bigger number
2676 than all previously defined loops. This fact allows us to run
2677 only over initial loops skipping newly generated ones. */
2678 FOR_EACH_LOOP (li, loop, 0)
2680 loop_vec_info loop_vinfo;
2682 vect_loop_location = find_loop_location (loop);
2683 loop_vinfo = vect_analyze_loop (loop);
2684 loop->aux = loop_vinfo;
2686 if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
2687 continue;
2689 vect_transform_loop (loop_vinfo);
2690 num_vectorized_loops++;
2692 vect_loop_location = UNKNOWN_LOC;
2694 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)
2695 || (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)
2696 && num_vectorized_loops > 0))
2697 fprintf (vect_dump, "vectorized %u loops in function.\n",
2698 num_vectorized_loops);
2700 /* ----------- Finalize. ----------- */
2702 BITMAP_FREE (vect_memsyms_to_rename);
2704 for (i = 1; i < vect_loops_num; i++)
2706 loop_vec_info loop_vinfo;
2708 loop = get_loop (i);
2709 if (!loop)
2710 continue;
2711 loop_vinfo = loop->aux;
2712 destroy_loop_vec_info (loop_vinfo, true);
2713 loop->aux = NULL;
2716 return num_vectorized_loops > 0 ? TODO_cleanup_cfg : 0;
2719 /* Increase alignment of global arrays to improve vectorization potential.
2720 TODO:
2721 - Consider also structs that have an array field.
2722 - Use ipa analysis to prune arrays that can't be vectorized?
2723 This should involve global alignment analysis and in the future also
2724 array padding. */
2726 static unsigned int
2727 increase_alignment (void)
2729 struct varpool_node *vnode;
2731 /* Increase the alignment of all global arrays for vectorization. */
2732 for (vnode = varpool_nodes_queue;
2733 vnode;
2734 vnode = vnode->next_needed)
2736 tree vectype, decl = vnode->decl;
2737 unsigned int alignment;
2739 if (TREE_CODE (TREE_TYPE (decl)) != ARRAY_TYPE)
2740 continue;
2741 vectype = get_vectype_for_scalar_type (TREE_TYPE (TREE_TYPE (decl)));
2742 if (!vectype)
2743 continue;
2744 alignment = TYPE_ALIGN (vectype);
2745 if (DECL_ALIGN (decl) >= alignment)
2746 continue;
2748 if (vect_can_force_dr_alignment_p (decl, alignment))
2750 DECL_ALIGN (decl) = TYPE_ALIGN (vectype);
2751 DECL_USER_ALIGN (decl) = 1;
2752 if (dump_file)
2754 fprintf (dump_file, "Increasing alignment of decl: ");
2755 print_generic_expr (dump_file, decl, TDF_SLIM);
2759 return 0;
2762 static bool
2763 gate_increase_alignment (void)
2765 return flag_section_anchors && flag_tree_vectorize;
2768 struct simple_ipa_opt_pass pass_ipa_increase_alignment =
2771 SIMPLE_IPA_PASS,
2772 "increase_alignment", /* name */
2773 gate_increase_alignment, /* gate */
2774 increase_alignment, /* execute */
2775 NULL, /* sub */
2776 NULL, /* next */
2777 0, /* static_pass_number */
2778 0, /* tv_id */
2779 0, /* properties_required */
2780 0, /* properties_provided */
2781 0, /* properties_destroyed */
2782 0, /* todo_flags_start */
2783 0 /* todo_flags_finish */