ChangeLog/libgcc
[official-gcc.git] / gcc / modulo-sched.c
blob25cd53a7975ffd528d30b4f4d216ebb9dbc7046e
1 /* Swing Modulo Scheduling implementation.
2 Copyright (C) 2004, 2005, 2006, 2007
3 Free Software Foundation, Inc.
4 Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to the Free
20 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
21 02110-1301, USA. */
24 #include "config.h"
25 #include "system.h"
26 #include "coretypes.h"
27 #include "tm.h"
28 #include "toplev.h"
29 #include "rtl.h"
30 #include "tm_p.h"
31 #include "hard-reg-set.h"
32 #include "regs.h"
33 #include "function.h"
34 #include "flags.h"
35 #include "insn-config.h"
36 #include "insn-attr.h"
37 #include "except.h"
38 #include "toplev.h"
39 #include "recog.h"
40 #include "sched-int.h"
41 #include "target.h"
42 #include "cfglayout.h"
43 #include "cfgloop.h"
44 #include "cfghooks.h"
45 #include "expr.h"
46 #include "params.h"
47 #include "gcov-io.h"
48 #include "ddg.h"
49 #include "timevar.h"
50 #include "tree-pass.h"
52 #ifdef INSN_SCHEDULING
54 /* This file contains the implementation of the Swing Modulo Scheduler,
55 described in the following references:
56 [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero., and J. Eckhardt.
57 Lifetime--sensitive modulo scheduling in a production environment.
58 IEEE Trans. on Comps., 50(3), March 2001
59 [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
60 Swing Modulo Scheduling: A Lifetime Sensitive Approach.
61 PACT '96 , pages 80-87, October 1996 (Boston - Massachusetts - USA).
63 The basic structure is:
64 1. Build a data-dependence graph (DDG) for each loop.
65 2. Use the DDG to order the insns of a loop (not in topological order
66 necessarily, but rather) trying to place each insn after all its
67 predecessors _or_ after all its successors.
68 3. Compute MII: a lower bound on the number of cycles to schedule the loop.
69 4. Use the ordering to perform list-scheduling of the loop:
70 1. Set II = MII. We will try to schedule the loop within II cycles.
71 2. Try to schedule the insns one by one according to the ordering.
72 For each insn compute an interval of cycles by considering already-
73 scheduled preds and succs (and associated latencies); try to place
74 the insn in the cycles of this window checking for potential
75 resource conflicts (using the DFA interface).
76 Note: this is different from the cycle-scheduling of schedule_insns;
77 here the insns are not scheduled monotonically top-down (nor bottom-
78 up).
79 3. If failed in scheduling all insns - bump II++ and try again, unless
80 II reaches an upper bound MaxII, in which case report failure.
81 5. If we succeeded in scheduling the loop within II cycles, we now
82 generate prolog and epilog, decrease the counter of the loop, and
83 perform modulo variable expansion for live ranges that span more than
84 II cycles (i.e. use register copies to prevent a def from overwriting
85 itself before reaching the use).
89 /* This page defines partial-schedule structures and functions for
90 modulo scheduling. */
92 typedef struct partial_schedule *partial_schedule_ptr;
93 typedef struct ps_insn *ps_insn_ptr;
95 /* The minimum (absolute) cycle that a node of ps was scheduled in. */
96 #define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)
98 /* The maximum (absolute) cycle that a node of ps was scheduled in. */
99 #define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)
101 /* Perform signed modulo, always returning a non-negative value. */
/* NOTE(review): X and Y are each evaluated more than once -- do not
   pass expressions with side effects.  */
102 #define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y))
104 /* The number of different iterations the nodes in ps span, assuming
105 the stage boundaries are placed efficiently. */
/* This is ceil ((max_cycle - min_cycle + 1) / ii), written with the
   usual "+ ii - 1" trick (valid for the non-negative span here).  */
106 #define PS_STAGE_COUNT(ps) ((PS_MAX_CYCLE (ps) - PS_MIN_CYCLE (ps) \
107 + 1 + (ps)->ii - 1) / (ps)->ii)
109 /* A single instruction in the partial schedule. */
110 struct ps_insn
112 /* The corresponding DDG_NODE. */
113 ddg_node_ptr node;
115 /* The (absolute) cycle in which the PS instruction is scheduled.
116 Same as SCHED_TIME (node). */
117 int cycle;
119 /* The next/prev PS_INSN in the same row. */
/* Rows are doubly-linked lists; row membership is cycle modulo ii
   (see struct partial_schedule below).  */
120 ps_insn_ptr next_in_row,
121 prev_in_row;
123 /* The number of nodes in the same row that come after this node. */
124 int row_rest_count;
127 /* Holds the partial schedule as an array of II rows. Each entry of the
128 array points to a linked list of PS_INSNs, which represents the
129 instructions that are scheduled for that row. */
130 struct partial_schedule
132 int ii; /* Number of rows in the partial schedule. */
133 int history; /* Threshold for conflict checking using DFA. */
135 /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii). */
136 ps_insn_ptr *rows;
138 /* The earliest absolute cycle of an insn in the partial schedule. */
139 int min_cycle;
141 /* The latest absolute cycle of an insn in the partial schedule. */
142 int max_cycle;
144 ddg_ptr g; /* The DDG of the insns in the partial schedule. */
147 /* We use this to record all the register replacements we do in
148 the kernel so we can undo SMS if it is not profitable. */
/* Singly-linked list node: INSN had ORIG_REG replaced by NEW_REG
   (see generate_reg_moves / undo_generate_reg_moves below).  */
149 struct undo_replace_buff_elem
151 rtx insn;
152 rtx orig_reg;
153 rtx new_reg;
154 struct undo_replace_buff_elem *next;
/* Forward declarations for the partial-schedule helpers defined later
   in this file.  */
159 static partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
160 static void free_partial_schedule (partial_schedule_ptr);
161 static void reset_partial_schedule (partial_schedule_ptr, int new_ii);
162 void print_partial_schedule (partial_schedule_ptr, FILE *);
163 static int kernel_number_of_cycles (rtx first_insn, rtx last_insn);
164 static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
165 ddg_node_ptr node, int cycle,
166 sbitmap must_precede,
167 sbitmap must_follow);
168 static void rotate_partial_schedule (partial_schedule_ptr, int);
169 void set_row_column_for_ps (partial_schedule_ptr);
170 static bool ps_unschedule_node (partial_schedule_ptr, ddg_node_ptr );
173 /* This page defines constants and structures for the modulo scheduling
174 driver. */
176 /* As in haifa-sched.c: */
177 /* issue_rate is the number of insns that can be scheduled in the same
178 machine cycle. It can be defined in the config/mach/mach.h file,
179 otherwise we set it to 1. */
181 static int issue_rate;
183 static int sms_order_nodes (ddg_ptr, int, int * result);
184 static void set_node_sched_params (ddg_ptr);
185 static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *);
186 static void permute_partial_schedule (partial_schedule_ptr ps, rtx last);
187 static void generate_prolog_epilog (partial_schedule_ptr ,struct loop * loop, rtx);
188 static void duplicate_insns_of_cycles (partial_schedule_ptr ps,
189 int from_stage, int to_stage,
190 int is_prolog);
/* Accessors for the per-node scheduling data.  X is a ddg_node_ptr;
   its aux.info field must already point to a node_sched_params record
   (set up by set_node_sched_params below).  */
192 #define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
193 #define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
194 #define SCHED_FIRST_REG_MOVE(x) \
195 (((node_sched_params_ptr)(x)->aux.info)->first_reg_move)
196 #define SCHED_NREG_MOVES(x) \
197 (((node_sched_params_ptr)(x)->aux.info)->nreg_moves)
198 #define SCHED_ROW(x) (((node_sched_params_ptr)(x)->aux.info)->row)
199 #define SCHED_STAGE(x) (((node_sched_params_ptr)(x)->aux.info)->stage)
200 #define SCHED_COLUMN(x) (((node_sched_params_ptr)(x)->aux.info)->column)
202 /* The scheduling parameters held for each node. */
203 typedef struct node_sched_params
205 int asap; /* A lower-bound on the absolute scheduling cycle. */
206 int time; /* The absolute scheduling cycle (time >= asap). */
208 /* The following field (first_reg_move) is a pointer to the first
209 register-move instruction added to handle the modulo-variable-expansion
210 of the register defined by this node. This register-move copies the
211 original register defined by the node. */
212 rtx first_reg_move;
214 /* The number of register-move instructions added, immediately preceding
215 first_reg_move. */
216 int nreg_moves;
218 int row; /* Holds time % ii. */
219 int stage; /* Holds time / ii. */
221 /* The column of a node inside the ps. If nodes u, v are on the same row,
222 u will precede v if column (u) < column (v). */
223 int column;
224 } *node_sched_params_ptr;
227 /* The following three functions are copied from the current scheduler
228 code in order to use sched_analyze() for computing the dependencies.
229 They are used when initializing the sched_info structure. */
230 static const char *
231 sms_print_insn (rtx insn, int aligned ATTRIBUTE_UNUSED)
233 static char tmp[80];
235 sprintf (tmp, "i%4d", INSN_UID (insn));
236 return tmp;
239 static void
240 compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
241 regset cond_exec ATTRIBUTE_UNUSED,
242 regset used ATTRIBUTE_UNUSED,
243 regset set ATTRIBUTE_UNUSED)
/* Scheduler-hook table handed to the haifa scheduler so that
   sched_analyze can be reused for SMS dependence analysis.  Only
   print_insn (sms_print_insn) and compute_jump_reg_dependencies are
   meaningful for SMS; the remaining callbacks and flags are unused
   and left NULL/0.  NOTE(review): these positional initializers must
   match the field order of struct sched_info in sched-int.h --
   verify against that header when porting.  */
247 static struct sched_info sms_sched_info =
249 NULL,
250 NULL,
251 NULL,
252 NULL,
253 NULL,
254 sms_print_insn,
255 NULL,
256 compute_jump_reg_dependencies,
257 NULL, NULL,
258 NULL, NULL,
259 0, 0, 0,
261 NULL, NULL, NULL, NULL, NULL,
266 /* Return the register decremented and tested in INSN,
267 or zero if it is not a decrement-and-branch insn. */
269 static rtx
270 doloop_register_get (rtx insn ATTRIBUTE_UNUSED)
272 #ifdef HAVE_doloop_end
273 rtx pattern, reg, condition;
275 if (! JUMP_P (insn))
276 return NULL_RTX;
278 pattern = PATTERN (insn);
279 condition = doloop_condition_get (pattern);
280 if (! condition)
281 return NULL_RTX;
283 if (REG_P (XEXP (condition, 0)))
284 reg = XEXP (condition, 0);
285 else if (GET_CODE (XEXP (condition, 0)) == PLUS
286 && REG_P (XEXP (XEXP (condition, 0), 0)))
287 reg = XEXP (XEXP (condition, 0), 0);
288 else
289 gcc_unreachable ();
291 return reg;
292 #else
293 return NULL_RTX;
294 #endif
297 /* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
298 that the number of iterations is a compile-time constant. If so,
299 return the rtx that sets COUNT_REG to a constant, and set COUNT to
300 this constant. Otherwise return 0. */
301 static rtx
302 const_iteration_count (rtx count_reg, basic_block pre_header,
303 HOST_WIDEST_INT * count)
305 rtx insn;
306 rtx head, tail;
308 if (! pre_header)
309 return NULL_RTX;
311 get_ebb_head_tail (pre_header, pre_header, &head, &tail);
/* Walk the pre-header backwards so the first match found is the LAST
   insn that sets COUNT_REG before the loop is entered.  */
313 for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
314 if (INSN_P (insn) && single_set (insn) &&
315 rtx_equal_p (count_reg, SET_DEST (single_set (insn))))
317 rtx pat = single_set (insn);
319 if (GET_CODE (SET_SRC (pat)) == CONST_INT)
321 *count = INTVAL (SET_SRC (pat));
322 return insn;
/* The dominating set of COUNT_REG is not a CONST_INT, so the trip
   count is not a compile-time constant: stop searching.  */
325 return NULL_RTX;
/* No insn in the pre-header sets COUNT_REG at all.  */
328 return NULL_RTX;
331 /* A very simple resource-based lower bound on the initiation interval.
332 ??? Improve the accuracy of this bound by considering the
333 utilization of various units. */
334 static int
335 res_MII (ddg_ptr g)
337 return (g->num_nodes / issue_rate);
341 /* Points to the array that contains the sched data for each node. */
/* Allocated by set_node_sched_params; entry I belongs to g->nodes[I].  */
342 static node_sched_params_ptr node_sched_params;
344 /* Allocate sched_params for each node and initialize it. Assumes that
345 the aux field of each node contain the asap bound (computed earlier),
346 and copies it into the sched_params field. */
347 static void
348 set_node_sched_params (ddg_ptr g)
350 int i;
352 /* Allocate for each node in the DDG a place to hold the "sched_data". */
353 /* Initialize ASAP/ALAP/HIGHT to zero. */
354 node_sched_params = (node_sched_params_ptr)
355 xcalloc (g->num_nodes,
356 sizeof (struct node_sched_params));
358 /* Set the pointer of the general data of the node to point to the
359 appropriate sched_params structure. */
360 for (i = 0; i < g->num_nodes; i++)
362 /* Watch out for aliasing problems? */
363 node_sched_params[i].asap = g->nodes[i].aux.count;
364 g->nodes[i].aux.info = &node_sched_params[i];
368 static void
369 print_node_sched_params (FILE * file, int num_nodes)
371 int i;
373 if (! file)
374 return;
375 for (i = 0; i < num_nodes; i++)
377 node_sched_params_ptr nsp = &node_sched_params[i];
378 rtx reg_move = nsp->first_reg_move;
379 int j;
381 fprintf (file, "Node %d:\n", i);
382 fprintf (file, " asap = %d:\n", nsp->asap);
383 fprintf (file, " time = %d:\n", nsp->time);
384 fprintf (file, " nreg_moves = %d:\n", nsp->nreg_moves);
385 for (j = 0; j < nsp->nreg_moves; j++)
387 fprintf (file, " reg_move = ");
388 print_rtl_single (file, reg_move);
389 reg_move = PREV_INSN (reg_move);
394 /* Calculate an upper bound for II. SMS should not schedule the loop if it
395 requires more cycles than this bound. Currently set to the sum of the
396 longest latency edge for each node. Reset based on experiments. */
397 static int
398 calculate_maxii (ddg_ptr g)
400 int i;
401 int maxii = 0;
403 for (i = 0; i < g->num_nodes; i++)
405 ddg_node_ptr u = &g->nodes[i];
406 ddg_edge_ptr e;
407 int max_edge_latency = 0;
409 for (e = u->out; e; e = e->next_out)
410 max_edge_latency = MAX (max_edge_latency, e->latency);
412 maxii += max_edge_latency;
414 return maxii;
418 Breaking intra-loop register anti-dependences:
419 Each intra-loop register anti-dependence implies a cross-iteration true
420 dependence of distance 1. Therefore, we can remove such false dependencies
421 and figure out if the partial schedule broke them by checking if (for a
422 true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and
423 if so generate a register move. The number of such moves is equal to:
424 SCHED_TIME (use) - SCHED_TIME (def) { 0 broken
425 nreg_moves = ----------------------------------- + 1 - { dependence.
426 ii { 1 if not.
/* For each node U whose defined register stays live across stage
   boundaries, emit SCHED_NREG_MOVES (u) register copies right before
   U's insn and rewrite the affected uses to read the proper copy.
   Returns a list recording every use replacement so the caller can
   undo the transformation if SMS turns out unprofitable.  */
428 static struct undo_replace_buff_elem *
429 generate_reg_moves (partial_schedule_ptr ps)
431 ddg_ptr g = ps->g;
432 int ii = ps->ii;
433 int i;
434 struct undo_replace_buff_elem *reg_move_replaces = NULL;
436 for (i = 0; i < g->num_nodes; i++)
438 ddg_node_ptr u = &g->nodes[i];
439 ddg_edge_ptr e;
440 int nreg_moves = 0, i_reg_move;
441 sbitmap *uses_of_defs;
442 rtx last_reg_move;
443 rtx prev_reg, old_reg;
445 /* Compute the number of reg_moves needed for u, by looking at life
446 ranges started at u (excluding self-loops). */
447 for (e = u->out; e; e = e->next_out)
448 if (e->type == TRUE_DEP && e->dest != e->src)
450 int nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
/* A distance-1 dependence reads the value defined one iteration
   earlier, hence the extra II cycles in the live range.  */
452 if (e->distance == 1)
453 nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
455 /* If dest precedes src in the schedule of the kernel, then dest
456 will read before src writes and we can save one reg_copy. */
457 if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
458 && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
459 nreg_moves4e--;
461 nreg_moves = MAX (nreg_moves, nreg_moves4e);
464 if (nreg_moves == 0)
465 continue;
467 /* Every use of the register defined by node may require a different
468 copy of this register, depending on the time the use is scheduled.
469 Set a bitmap vector, telling which nodes use each copy of this
470 register. */
471 uses_of_defs = sbitmap_vector_alloc (nreg_moves, g->num_nodes);
472 sbitmap_vector_zero (uses_of_defs, nreg_moves);
473 for (e = u->out; e; e = e->next_out)
474 if (e->type == TRUE_DEP && e->dest != e->src)
476 int dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
478 if (e->distance == 1)
479 dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
481 if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
482 && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
483 dest_copy--;
/* dest_copy == 0 means the use can keep reading the original
   register; only copies 1..nreg_moves are recorded.  */
485 if (dest_copy)
486 SET_BIT (uses_of_defs[dest_copy - 1], e->dest->cuid);
489 /* Now generate the reg_moves, attaching relevant uses to them. */
490 SCHED_NREG_MOVES (u) = nreg_moves;
/* NOTE(review): assumes U->insn has a single_set whose SET_DEST is
   the register being expanded -- confirm for all scheduled insns.  */
491 old_reg = prev_reg = copy_rtx (SET_DEST (single_set (u->insn)));
492 last_reg_move = u->insn;
/* Emit the chain new_reg <- prev_reg, each move inserted before the
   previously emitted one, and rewrite the uses assigned to it.  */
494 for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
496 unsigned int i_use = 0;
497 rtx new_reg = gen_reg_rtx (GET_MODE (prev_reg));
498 rtx reg_move = gen_move_insn (new_reg, prev_reg);
499 sbitmap_iterator sbi;
501 add_insn_before (reg_move, last_reg_move, NULL);
502 last_reg_move = reg_move;
504 if (!SCHED_FIRST_REG_MOVE (u))
505 SCHED_FIRST_REG_MOVE (u) = reg_move;
507 EXECUTE_IF_SET_IN_SBITMAP (uses_of_defs[i_reg_move], 0, i_use, sbi)
509 struct undo_replace_buff_elem *rep;
511 rep = (struct undo_replace_buff_elem *)
512 xcalloc (1, sizeof (struct undo_replace_buff_elem));
513 rep->insn = g->nodes[i_use].insn;
514 rep->orig_reg = old_reg;
515 rep->new_reg = new_reg;
/* Push the record onto the front of the undo list.  */
517 if (! reg_move_replaces)
518 reg_move_replaces = rep;
519 else
521 rep->next = reg_move_replaces;
522 reg_move_replaces = rep;
525 replace_rtx (g->nodes[i_use].insn, old_reg, new_reg);
528 prev_reg = new_reg;
530 sbitmap_vector_free (uses_of_defs);
532 return reg_move_replaces;
535 /* We call this when we want to undo the SMS schedule for a given loop.
536 One of the things that we do is to delete the register moves generated
537 for the sake of SMS; this function deletes the register move instructions
538 recorded in the undo buffer. */
539 static void
540 undo_generate_reg_moves (partial_schedule_ptr ps,
541 struct undo_replace_buff_elem *reg_move_replaces)
543 int i,j;
545 for (i = 0; i < ps->g->num_nodes; i++)
547 ddg_node_ptr u = &ps->g->nodes[i];
548 rtx prev;
549 rtx crr = SCHED_FIRST_REG_MOVE (u);
551 for (j = 0; j < SCHED_NREG_MOVES (u); j++)
553 prev = PREV_INSN (crr);
554 delete_insn (crr);
555 crr = prev;
557 SCHED_FIRST_REG_MOVE (u) = NULL_RTX;
560 while (reg_move_replaces)
562 struct undo_replace_buff_elem *rep = reg_move_replaces;
564 reg_move_replaces = reg_move_replaces->next;
565 replace_rtx (rep->insn, rep->new_reg, rep->orig_reg);
569 /* Free memory allocated for the undo buffer. */
570 static void
571 free_undo_replace_buff (struct undo_replace_buff_elem *reg_move_replaces)
574 while (reg_move_replaces)
576 struct undo_replace_buff_elem *rep = reg_move_replaces;
578 reg_move_replaces = reg_move_replaces->next;
579 free (rep);
583 /* Bump the SCHED_TIMEs of all nodes to start from zero. Set the values
584 of SCHED_ROW and SCHED_STAGE. */
585 static void
586 normalize_sched_times (partial_schedule_ptr ps)
588 int i;
589 ddg_ptr g = ps->g;
590 int amount = PS_MIN_CYCLE (ps);
591 int ii = ps->ii;
593 /* Don't include the closing branch assuming that it is the last node. */
594 for (i = 0; i < g->num_nodes - 1; i++)
596 ddg_node_ptr u = &g->nodes[i];
597 int normalized_time = SCHED_TIME (u) - amount;
599 gcc_assert (normalized_time >= 0);
601 SCHED_TIME (u) = normalized_time;
602 SCHED_ROW (u) = normalized_time % ii;
603 SCHED_STAGE (u) = normalized_time / ii;
607 /* Set SCHED_COLUMN of each node according to its position in PS. */
608 static void
609 set_columns_for_ps (partial_schedule_ptr ps)
611 int row;
613 for (row = 0; row < ps->ii; row++)
615 ps_insn_ptr cur_insn = ps->rows[row];
616 int column = 0;
618 for (; cur_insn; cur_insn = cur_insn->next_in_row)
619 SCHED_COLUMN (cur_insn->node) = column++;
623 /* Permute the insns according to their order in PS, from row 0 to
624 row ii-1, and position them right before LAST. This schedules
625 the insns of the loop kernel. */
626 static void
627 permute_partial_schedule (partial_schedule_ptr ps, rtx last)
629 int ii = ps->ii;
630 int row;
631 ps_insn_ptr ps_ij;
633 for (row = 0; row < ii ; row++)
634 for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
635 if (PREV_INSN (last) != ps_ij->node->insn)
636 reorder_insns_nobb (ps_ij->node->first_note, ps_ij->node->insn,
637 PREV_INSN (last));
640 /* As part of undoing SMS we return to the original ordering of the
641 instructions inside the loop kernel. Given the partial schedule PS, this
642 function returns the ordering of the instruction according to their CUID
643 in the DDG (PS->G), which is the original order of the instruction before
644 performing SMS. */
645 static void
646 undo_permute_partial_schedule (partial_schedule_ptr ps, rtx last)
648 int i;
650 for (i = 0 ; i < ps->g->num_nodes; i++)
651 if (last == ps->g->nodes[i].insn
652 || last == ps->g->nodes[i].first_note)
653 break;
654 else if (PREV_INSN (last) != ps->g->nodes[i].insn)
655 reorder_insns_nobb (ps->g->nodes[i].first_note, ps->g->nodes[i].insn,
656 PREV_INSN (last));
659 /* Used to generate the prologue & epilogue. Duplicate the subset of
660 nodes whose stages are between FROM_STAGE and TO_STAGE (inclusive
661 of both), together with a prefix/suffix of their reg_moves. */
/* NOTE(review): emits into the sequence opened by the caller
   (generate_prolog_epilog uses start_sequence around each call).  */
662 static void
663 duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
664 int to_stage, int for_prolog)
666 int row;
667 ps_insn_ptr ps_ij;
669 for (row = 0; row < ps->ii; row++)
670 for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
672 ddg_node_ptr u_node = ps_ij->node;
673 int j, i_reg_moves;
674 rtx reg_move = NULL_RTX;
676 if (for_prolog)
678 /* SCHED_STAGE (u_node) >= from_stage == 0. Generate increasing
679 number of reg_moves starting with the second occurrence of
680 u_node, which is generated if its SCHED_STAGE <= to_stage. */
681 i_reg_moves = to_stage - SCHED_STAGE (u_node) + 1;
682 i_reg_moves = MAX (i_reg_moves, 0);
683 i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
685 /* The reg_moves start from the *first* reg_move backwards. */
686 if (i_reg_moves)
688 reg_move = SCHED_FIRST_REG_MOVE (u_node);
689 for (j = 1; j < i_reg_moves; j++)
690 reg_move = PREV_INSN (reg_move);
693 else /* It's for the epilog. */
695 /* SCHED_STAGE (u_node) <= to_stage. Generate all reg_moves,
696 starting to decrease one stage after u_node no longer occurs;
697 that is, generate all reg_moves until
698 SCHED_STAGE (u_node) == from_stage - 1. */
699 i_reg_moves = SCHED_NREG_MOVES (u_node)
700 - (from_stage - SCHED_STAGE (u_node) - 1);
701 i_reg_moves = MAX (i_reg_moves, 0);
702 i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
704 /* The reg_moves start from the *last* reg_move forwards. */
/* Step back over all the moves (they were inserted each before the
   previous one) to reach the earliest-emitted, i.e. last, copy.  */
705 if (i_reg_moves)
707 reg_move = SCHED_FIRST_REG_MOVE (u_node);
708 for (j = 1; j < SCHED_NREG_MOVES (u_node); j++)
709 reg_move = PREV_INSN (reg_move);
/* Emit copies of the selected reg_moves in emission order, then the
   node's own insn if its stage falls in [FROM_STAGE, TO_STAGE].  */
713 for (j = 0; j < i_reg_moves; j++, reg_move = NEXT_INSN (reg_move))
714 emit_insn (copy_rtx (PATTERN (reg_move)));
715 if (SCHED_STAGE (u_node) >= from_stage
716 && SCHED_STAGE (u_node) <= to_stage)
717 duplicate_insn_chain (u_node->first_note, u_node->insn);
722 /* Generate the instructions (including reg_moves) for prolog & epilog. */
/* PS is the accepted partial schedule, LOOP the loop being modulo
   scheduled; COUNT_REG, when non-NULL, is the doloop count register,
   which is pre-decremented by the stage count so the kernel executes
   the right number of times.  */
723 static void
724 generate_prolog_epilog (partial_schedule_ptr ps, struct loop * loop, rtx count_reg)
726 int i;
727 int last_stage = PS_STAGE_COUNT (ps) - 1;
728 edge e;
730 /* Generate the prolog, inserting its insns on the loop-entry edge. */
731 start_sequence ();
733 if (count_reg)
734 /* Generate a subtract instruction at the beginning of the prolog to
735 adjust the loop count by STAGE_COUNT. */
736 emit_insn (gen_sub2_insn (count_reg, GEN_INT (last_stage)));
/* Stage i of the prolog fills in iterations for stages 0..i.  */
738 for (i = 0; i < last_stage; i++)
739 duplicate_insns_of_cycles (ps, 0, i, 1);
741 /* Put the prolog on the entry edge. */
742 e = loop_preheader_edge (loop);
743 split_edge_and_insert (e, get_insns ());
745 end_sequence ();
747 /* Generate the epilog, inserting its insns on the loop-exit edge. */
748 start_sequence ();
/* Stage i of the epilog drains iterations for stages i+1..last.  */
750 for (i = 0; i < last_stage; i++)
751 duplicate_insns_of_cycles (ps, i + 1, last_stage, 0);
753 /* Put the epilogue on the exit edge. */
754 gcc_assert (single_exit (loop));
755 e = single_exit (loop);
756 split_edge_and_insert (e, get_insns ());
757 end_sequence ();
760 /* Return true if all the BBs of the loop are empty except the
761 loop header. */
762 static bool
763 loop_single_full_bb_p (struct loop *loop)
765 unsigned i;
766 basic_block *bbs = get_loop_body (loop);
768 for (i = 0; i < loop->num_nodes ; i++)
770 rtx head, tail;
771 bool empty_bb = true;
773 if (bbs[i] == loop->header)
774 continue;
776 /* Make sure that basic blocks other than the header
777 have only notes labels or jumps. */
778 get_ebb_head_tail (bbs[i], bbs[i], &head, &tail);
779 for (; head != NEXT_INSN (tail); head = NEXT_INSN (head))
781 if (NOTE_P (head) || LABEL_P (head)
782 || (INSN_P (head) && JUMP_P (head)))
783 continue;
784 empty_bb = false;
785 break;
788 if (! empty_bb)
790 free (bbs);
791 return false;
794 free (bbs);
795 return true;
798 /* A simple loop from SMS point of view; it is a loop that is composed of
799 either a single basic block or two BBs - a header and a latch. */
/* NOTE(review): LOOP is evaluated several times -- pass a plain
   pointer only; also requires loop->latch to be non-NULL.  */
800 #define SIMPLE_SMS_LOOP_P(loop) ((loop->num_nodes < 3 ) \
801 && (EDGE_COUNT (loop->latch->preds) == 1) \
802 && (EDGE_COUNT (loop->latch->succs) == 1))
804 /* Return true if the loop is in its canonical form and false if not.
805 i.e. SIMPLE_SMS_LOOP_P and have one preheader block, and single exit. */
806 static bool
807 loop_canon_p (struct loop *loop)
/* Reject loops that contain inner loops, and the root of the loop
   tree (presumably the dummy outermost "loop" -- confirm).  */
810 if (loop->inner || !loop_outer (loop))
811 return false;
/* SMS requires exactly one exit edge.  */
813 if (!single_exit (loop))
815 if (dump_file)
817 rtx insn = BB_END (loop->header);
819 fprintf (dump_file, "SMS loop many exits ")`;
823 return false;
/* The loop must be "simple" (header + optional latch) or have all of
   its real insns in the header block.  */
826 if (! SIMPLE_SMS_LOOP_P (loop) && ! loop_single_full_bb_p (loop))
828 if (dump_file)
830 rtx insn = BB_END (loop->header);
832 fprintf (dump_file, "SMS loop many BBs. ");
833 fprintf (dump_file, " %s %d (file, line)\n",
834 insn_file (insn), insn_line (insn));
836 return false;
839 return true;
842 /* If there are more than one entry for the loop,
843 make it one by splitting the first entry edge and
844 redirecting the others to the new BB. */
845 static void
846 canon_loop (struct loop *loop)
848 edge e;
849 edge_iterator i;
851 /* Avoid annoying special cases of edges going to exit
852 block. */
853 FOR_EACH_EDGE (e, i, EXIT_BLOCK_PTR->preds)
854 if ((e->flags & EDGE_FALLTHRU) && (EDGE_COUNT (e->src->succs) > 1))
855 split_edge (e);
857 if (loop->latch == loop->header
858 || EDGE_COUNT (loop->latch->succs) > 1)
860 FOR_EACH_EDGE (e, i, loop->header->preds)
861 if (e->src == loop->latch)
862 break;
863 split_edge (e);
867 /* Probability in % that the sms-ed loop rolls enough so that optimized
868 version may be entered. Just a guess. */
/* NOTE(review): not referenced in this portion of the file --
   presumably used by the loop-versioning code in sms_schedule;
   confirm against the full file.  */
869 #define PROB_SMS_ENOUGH_ITERATIONS 80
871 /* Main entry point, perform SMS scheduling on the loops of the function
872 that consist of single basic blocks. */
873 static void
874 sms_schedule (void)
876 static int passes = 0;
877 rtx insn;
878 ddg_ptr *g_arr, g;
879 int * node_order;
880 int maxii;
881 loop_iterator li;
882 partial_schedule_ptr ps;
883 basic_block bb = NULL;
884 struct loop *loop;
885 basic_block condition_bb = NULL;
886 edge latch_edge;
887 gcov_type trip_count = 0;
889 loop_optimizer_init (LOOPS_HAVE_PREHEADERS
890 | LOOPS_HAVE_RECORDED_EXITS);
891 if (number_of_loops () <= 1)
893 loop_optimizer_finalize ();
894 return; /* There are no loops to schedule. */
897 /* Initialize issue_rate. */
898 if (targetm.sched.issue_rate)
900 int temp = reload_completed;
902 reload_completed = 1;
903 issue_rate = targetm.sched.issue_rate ();
904 reload_completed = temp;
906 else
907 issue_rate = 1;
909 /* Initialize the scheduler. */
910 current_sched_info = &sms_sched_info;
912 /* Init Data Flow analysis, to be used in interloop dep calculation. */
913 df_set_flags (DF_LR_RUN_DCE);
914 df_rd_add_problem ();
915 df_ru_add_problem ();
916 df_note_add_problem ();
917 df_chain_add_problem (DF_DU_CHAIN + DF_UD_CHAIN);
918 df_analyze ();
919 regstat_compute_calls_crossed ();
920 sched_init ();
922 /* Allocate memory to hold the DDG array one entry for each loop.
923 We use loop->num as index into this array. */
924 g_arr = XCNEWVEC (ddg_ptr, number_of_loops ());
926 /* Build DDGs for all the relevant loops and hold them in G_ARR
927 indexed by the loop index. */
928 FOR_EACH_LOOP (li, loop, 0)
930 rtx head, tail;
931 rtx count_reg;
933 /* For debugging. */
934 if ((passes++ > MAX_SMS_LOOP_NUMBER) && (MAX_SMS_LOOP_NUMBER != -1))
936 if (dump_file)
937 fprintf (dump_file, "SMS reached MAX_PASSES... \n");
939 break;
942 if (! loop_canon_p (loop))
943 continue;
945 if (! loop_single_full_bb_p (loop))
946 continue;
948 bb = loop->header;
950 get_ebb_head_tail (bb, bb, &head, &tail);
951 latch_edge = loop_latch_edge (loop);
952 gcc_assert (single_exit (loop));
953 if (single_exit (loop)->count)
954 trip_count = latch_edge->count / single_exit (loop)->count;
956 /* Perfrom SMS only on loops that their average count is above threshold. */
958 if ( latch_edge->count
959 && (latch_edge->count < single_exit (loop)->count * SMS_LOOP_AVERAGE_COUNT_THRESHOLD))
961 if (dump_file)
963 fprintf (dump_file, " %s %d (file, line)\n",
964 insn_file (tail), insn_line (tail));
965 fprintf (dump_file, "SMS single-bb-loop\n");
966 if (profile_info && flag_branch_probabilities)
968 fprintf (dump_file, "SMS loop-count ");
969 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
970 (HOST_WIDEST_INT) bb->count);
971 fprintf (dump_file, "\n");
972 fprintf (dump_file, "SMS trip-count ");
973 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
974 (HOST_WIDEST_INT) trip_count);
975 fprintf (dump_file, "\n");
976 fprintf (dump_file, "SMS profile-sum-max ");
977 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
978 (HOST_WIDEST_INT) profile_info->sum_max);
979 fprintf (dump_file, "\n");
982 continue;
985 /* Make sure this is a doloop. */
986 if ( !(count_reg = doloop_register_get (tail)))
987 continue;
989 /* Don't handle BBs with calls or barriers, or !single_set insns. */
990 for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
991 if (CALL_P (insn)
992 || BARRIER_P (insn)
993 || (INSN_P (insn) && !JUMP_P (insn)
994 && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
995 break;
997 if (insn != NEXT_INSN (tail))
999 if (dump_file)
1001 if (CALL_P (insn))
1002 fprintf (dump_file, "SMS loop-with-call\n");
1003 else if (BARRIER_P (insn))
1004 fprintf (dump_file, "SMS loop-with-barrier\n");
1005 else
1006 fprintf (dump_file, "SMS loop-with-not-single-set\n");
1007 print_rtl_single (dump_file, insn);
1010 continue;
1013 if (! (g = create_ddg (bb, 0)))
1015 if (dump_file)
1016 fprintf (dump_file, "SMS doloop\n");
1017 continue;
1020 g_arr[loop->num] = g;
1023 /* We don't want to perform SMS on new loops - created by versioning. */
1024 FOR_EACH_LOOP (li, loop, 0)
1026 rtx head, tail;
1027 rtx count_reg, count_init;
1028 int mii, rec_mii;
1029 unsigned stage_count = 0;
1030 HOST_WIDEST_INT loop_count = 0;
1032 if (! (g = g_arr[loop->num]))
1033 continue;
1035 if (dump_file)
1036 print_ddg (dump_file, g);
1038 get_ebb_head_tail (loop->header, loop->header, &head, &tail);
1040 latch_edge = loop_latch_edge (loop);
1041 gcc_assert (single_exit (loop));
1042 if (single_exit (loop)->count)
1043 trip_count = latch_edge->count / single_exit (loop)->count;
1045 if (dump_file)
1047 fprintf (dump_file, " %s %d (file, line)\n",
1048 insn_file (tail), insn_line (tail));
1049 fprintf (dump_file, "SMS single-bb-loop\n");
1050 if (profile_info && flag_branch_probabilities)
1052 fprintf (dump_file, "SMS loop-count ");
1053 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1054 (HOST_WIDEST_INT) bb->count);
1055 fprintf (dump_file, "\n");
1056 fprintf (dump_file, "SMS profile-sum-max ");
1057 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1058 (HOST_WIDEST_INT) profile_info->sum_max);
1059 fprintf (dump_file, "\n");
1061 fprintf (dump_file, "SMS doloop\n");
1062 fprintf (dump_file, "SMS built-ddg %d\n", g->num_nodes);
1063 fprintf (dump_file, "SMS num-loads %d\n", g->num_loads);
1064 fprintf (dump_file, "SMS num-stores %d\n", g->num_stores);
1068 /* In case of th loop have doloop register it gets special
1069 handling. */
1070 count_init = NULL_RTX;
1071 if ((count_reg = doloop_register_get (tail)))
1073 basic_block pre_header;
1075 pre_header = loop_preheader_edge (loop)->src;
1076 count_init = const_iteration_count (count_reg, pre_header,
1077 &loop_count);
1079 gcc_assert (count_reg);
1081 if (dump_file && count_init)
1083 fprintf (dump_file, "SMS const-doloop ");
1084 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1085 loop_count);
1086 fprintf (dump_file, "\n");
1089 node_order = XNEWVEC (int, g->num_nodes);
1091 mii = 1; /* Need to pass some estimate of mii. */
1092 rec_mii = sms_order_nodes (g, mii, node_order);
1093 mii = MAX (res_MII (g), rec_mii);
1094 maxii = (calculate_maxii (g) * SMS_MAX_II_FACTOR) / 100;
1096 if (dump_file)
1097 fprintf (dump_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
1098 rec_mii, mii, maxii);
1100 /* After sms_order_nodes and before sms_schedule_by_order, to copy over
1101 ASAP. */
1102 set_node_sched_params (g);
1104 ps = sms_schedule_by_order (g, mii, maxii, node_order);
1106 if (ps)
1107 stage_count = PS_STAGE_COUNT (ps);
1109 /* Stage count of 1 means that there is no interleaving between
1110 iterations, let the scheduling passes do the job. */
1111 if (stage_count < 1
1112 || (count_init && (loop_count <= stage_count))
1113 || (flag_branch_probabilities && (trip_count <= stage_count)))
1115 if (dump_file)
1117 fprintf (dump_file, "SMS failed... \n");
1118 fprintf (dump_file, "SMS sched-failed (stage-count=%d, loop-count=", stage_count);
1119 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, loop_count);
1120 fprintf (dump_file, ", trip-count=");
1121 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, trip_count);
1122 fprintf (dump_file, ")\n");
1124 continue;
1126 else
1128 int orig_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
1129 int new_cycles;
1130 struct undo_replace_buff_elem *reg_move_replaces;
1132 if (dump_file)
1134 fprintf (dump_file,
1135 "SMS succeeded %d %d (with ii, sc)\n", ps->ii,
1136 stage_count);
1137 print_partial_schedule (ps, dump_file);
1138 fprintf (dump_file,
1139 "SMS Branch (%d) will later be scheduled at cycle %d.\n",
1140 g->closing_branch->cuid, PS_MIN_CYCLE (ps) - 1);
1143 /* Set the stage boundaries. If the DDG is built with closing_branch_deps,
1144 the closing_branch was scheduled and should appear in the last (ii-1)
1145 row. Otherwise, we are free to schedule the branch, and we let nodes
1146 that were scheduled at the first PS_MIN_CYCLE cycle appear in the first
1147 row; this should reduce stage_count to minimum. */
1148 normalize_sched_times (ps);
1149 rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
1150 set_columns_for_ps (ps);
1152 /* Generate the kernel just to be able to measure its cycles. */
1153 permute_partial_schedule (ps, g->closing_branch->first_note);
1154 reg_move_replaces = generate_reg_moves (ps);
1156 /* Get the number of cycles the new kernel expect to execute in. */
1157 new_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
1159 /* Get back to the original loop so we can do loop versioning. */
1160 undo_permute_partial_schedule (ps, g->closing_branch->first_note);
1161 if (reg_move_replaces)
1162 undo_generate_reg_moves (ps, reg_move_replaces);
1164 if ( new_cycles >= orig_cycles)
1166 /* SMS is not profitable so undo the permutation and reg move generation
1167 and return the kernel to its original state. */
1168 if (dump_file)
1169 fprintf (dump_file, "Undoing SMS because it is not profitable.\n");
1172 else
1174 canon_loop (loop);
1176 /* case the BCT count is not known , Do loop-versioning */
1177 if (count_reg && ! count_init)
1179 rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
1180 GEN_INT(stage_count));
1181 unsigned prob = (PROB_SMS_ENOUGH_ITERATIONS
1182 * REG_BR_PROB_BASE) / 100;
1184 loop_version (loop, comp_rtx, &condition_bb,
1185 prob, prob, REG_BR_PROB_BASE - prob,
1186 true);
1189 /* Set new iteration count of loop kernel. */
1190 if (count_reg && count_init)
1191 SET_SRC (single_set (count_init)) = GEN_INT (loop_count
1192 - stage_count + 1);
1194 /* Now apply the scheduled kernel to the RTL of the loop. */
1195 permute_partial_schedule (ps, g->closing_branch->first_note);
1197 /* Mark this loop as software pipelined so the later
1198 scheduling passes doesn't touch it. */
1199 if (! flag_resched_modulo_sched)
1200 g->bb->flags |= BB_DISABLE_SCHEDULE;
1201 /* The life-info is not valid any more. */
1202 df_set_bb_dirty (g->bb);
1204 reg_move_replaces = generate_reg_moves (ps);
1205 if (dump_file)
1206 print_node_sched_params (dump_file, g->num_nodes);
1207 /* Generate prolog and epilog. */
1208 if (count_reg && !count_init)
1209 generate_prolog_epilog (ps, loop, count_reg);
1210 else
1211 generate_prolog_epilog (ps, loop, NULL_RTX);
1213 free_undo_replace_buff (reg_move_replaces);
1216 free_partial_schedule (ps);
1217 free (node_sched_params);
1218 free (node_order);
1219 free_ddg (g);
1222 regstat_free_calls_crossed ();
1223 free (g_arr);
1225 /* Release scheduler data, needed until now because of DFA. */
1226 sched_finish ();
1227 loop_optimizer_finalize ();
1230 /* The SMS scheduling algorithm itself
1231 -----------------------------------
1232 Input: 'O' an ordered list of insns of a loop.
1233 Output: A scheduling of the loop - kernel, prolog, and epilogue.
1235 'Q' is the empty Set
1236 'PS' is the partial schedule; it holds the currently scheduled nodes with
1237 their cycle/slot.
1238 'PSP' previously scheduled predecessors.
1239 'PSS' previously scheduled successors.
1240 't(u)' the cycle where u is scheduled.
1241 'l(u)' is the latency of u.
1242 'd(v,u)' is the dependence distance from v to u.
1243 'ASAP(u)' the earliest time at which u could be scheduled as computed in
1244 the node ordering phase.
1245 'check_hardware_resources_conflicts(u, PS, c)'
1246 run a trace around cycle/slot through DFA model
1247 to check resource conflicts involving instruction u
1248 at cycle c given the partial schedule PS.
1249 'add_to_partial_schedule_at_time(u, PS, c)'
1250 Add the node/instruction u to the partial schedule
1251 PS at time c.
1252 'calculate_register_pressure(PS)'
1253 Given a schedule of instructions, calculate the register
1254 pressure it implies. One implementation could be the
1255 maximum number of overlapping live ranges.
1256 'maxRP' The maximum allowed register pressure, it is usually derived from the number
1257 registers available in the hardware.
1259 1. II = MII.
1260 2. PS = empty list
1261 3. for each node u in O in pre-computed order
1262 4. if (PSP(u) != Q && PSS(u) == Q) then
1263 5. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u).
1264 6. start = Early_start; end = Early_start + II - 1; step = 1
1265 11. else if (PSP(u) == Q && PSS(u) != Q) then
1266 12. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u).
1267 13. start = Late_start; end = Late_start - II + 1; step = -1
1268 14. else if (PSP(u) != Q && PSS(u) != Q) then
1269 15. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u).
1270 16. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u).
1271 17. start = Early_start;
1272 18. end = min(Early_start + II - 1 , Late_start);
1273 19. step = 1
1274 20. else "if (PSP(u) == Q && PSS(u) == Q)"
1275 21. start = ASAP(u); end = start + II - 1; step = 1
1276 22. endif
1278 23. success = false
1279 24. for (c = start ; c != end ; c += step)
1280 25. if check_hardware_resources_conflicts(u, PS, c) then
1281 26. add_to_partial_schedule_at_time(u, PS, c)
1282 27. success = true
1283 28. break
1284 29. endif
1285 30. endfor
1286 31. if (success == false) then
1287 32. II = II + 1
1288 33. if (II > maxII) then
1289 34. finish - failed to schedule
1290 35. endif
1291 36. goto 2.
1292 37. endif
1293 38. endfor
1294 39. if (calculate_register_pressure(PS) > maxRP) then
1295 40. goto 32.
1296 41. endif
1297 42. compute epilogue & prologue
1298 43. finish - succeeded to schedule
1301 /* A limit on the number of cycles that resource conflicts can span. ??? Should
1302 be provided by DFA, and be dependent on the type of insn scheduled. Currently
1303 set to 0 to save compile time. */
1304 #define DFA_HISTORY SMS_DFA_HISTORY
1306 /* Given the partial schedule PS, this function calculates and returns the
1307 cycles in which we can schedule the node with the given index I.
1308 NOTE: Here we do the backtracking in SMS, in some special cases. We have
1309 noticed that there are several cases in which we fail to SMS the loop
1310 because the sched window of a node is empty due to tight data-deps. In
1311 such cases we want to unschedule some of the predecessors/successors
1312 until we get non-empty scheduling window. It returns -1 if the
1313 scheduling window is empty and zero otherwise. */
static int
get_sched_window (partial_schedule_ptr ps, int *nodes_order, int i,
		  sbitmap sched_nodes, int ii, int *start_p, int *step_p, int *end_p)
{
  int start, step, end;
  ddg_edge_ptr e;
  int u = nodes_order [i];
  ddg_node_ptr u_node = &ps->g->nodes[u];
  sbitmap psp = sbitmap_alloc (ps->g->num_nodes);
  sbitmap pss = sbitmap_alloc (ps->g->num_nodes);
  sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
  sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
  int psp_not_empty;
  int pss_not_empty;

  /* 1. compute sched window for u (start, end, step).  PSP/PSS are the
     already-scheduled predecessors/successors of U; the four cases below
     mirror steps 4-22 of the SMS pseudo-code given earlier in this file.  */
  sbitmap_zero (psp);
  sbitmap_zero (pss);
  psp_not_empty = sbitmap_a_and_b_cg (psp, u_node_preds, sched_nodes);
  pss_not_empty = sbitmap_a_and_b_cg (pss, u_node_succs, sched_nodes);

  if (psp_not_empty && !pss_not_empty)
    {
      /* Only predecessors scheduled: scan forward from the earliest
	 cycle that satisfies all incoming dependences.  */
      int early_start = INT_MIN;

      end = INT_MAX;
      for (e = u_node->in; e != 0; e = e->next_in)
	{
	  ddg_node_ptr v_node = e->src;
	  if (TEST_BIT (sched_nodes, v_node->cuid))
	    {
	      /* Dependence across DISTANCE iterations relaxes the bound
		 by DISTANCE * II cycles.  */
	      int node_st = SCHED_TIME (v_node)
			    + e->latency - (e->distance * ii);

	      early_start = MAX (early_start, node_st);

	      /* Memory dependences additionally cap the window so the
		 access stays within one II of its counterpart.  */
	      if (e->data_type == MEM_DEP)
		end = MIN (end, SCHED_TIME (v_node) + ii - 1);
	    }
	}
      start = early_start;
      end = MIN (end, early_start + ii);
      step = 1;
    }

  else if (!psp_not_empty && pss_not_empty)
    {
      /* Only successors scheduled: scan backwards from the latest cycle
	 that satisfies all outgoing dependences.  */
      int late_start = INT_MAX;

      end = INT_MIN;
      for (e = u_node->out; e != 0; e = e->next_out)
	{
	  ddg_node_ptr v_node = e->dest;
	  if (TEST_BIT (sched_nodes, v_node->cuid))
	    {
	      late_start = MIN (late_start,
				SCHED_TIME (v_node) - e->latency
				+ (e->distance * ii));
	      if (e->data_type == MEM_DEP)
		end = MAX (end, SCHED_TIME (v_node) - ii + 1);
	    }
	}
      start = late_start;
      end = MAX (end, late_start - ii);
      step = -1;
    }

  else if (psp_not_empty && pss_not_empty)
    {
      /* Both sides scheduled: window is bounded below by the
	 predecessors and above by the successors; scan forward.  */
      int early_start = INT_MIN;
      int late_start = INT_MAX;

      start = INT_MIN;
      end = INT_MAX;
      for (e = u_node->in; e != 0; e = e->next_in)
	{
	  ddg_node_ptr v_node = e->src;

	  if (TEST_BIT (sched_nodes, v_node->cuid))
	    {
	      early_start = MAX (early_start,
				 SCHED_TIME (v_node) + e->latency
				 - (e->distance * ii));
	      if (e->data_type == MEM_DEP)
		end = MIN (end, SCHED_TIME (v_node) + ii - 1);
	    }
	}
      for (e = u_node->out; e != 0; e = e->next_out)
	{
	  ddg_node_ptr v_node = e->dest;

	  if (TEST_BIT (sched_nodes, v_node->cuid))
	    {
	      late_start = MIN (late_start,
				SCHED_TIME (v_node) - e->latency
				+ (e->distance * ii));
	      if (e->data_type == MEM_DEP)
		start = MAX (start, SCHED_TIME (v_node) - ii + 1);
	    }
	}
      /* END is exclusive; late_start + 1 keeps late_start itself
	 reachable by the forward scan.  */
      start = MAX (start, early_start);
      end = MIN (end, MIN (early_start + ii, late_start + 1));
      step = 1;
    }
  else /* psp is empty && pss is empty.  */
    {
      /* No scheduled neighbors constrain U; start at its ASAP time.  */
      start = SCHED_ASAP (u_node);
      end = start + ii;
      step = 1;
    }

  *start_p = start;
  *step_p = step;
  *end_p = end;
  sbitmap_free (psp);
  sbitmap_free (pss);

  /* An inverted (empty) window means scheduling failed; the caller
     reacts by unscheduling neighbors or bumping II.  */
  if ((start >= end && step == 1) || (start <= end && step == -1))
    return -1;
  else
    return 0;
}
1438 /* This function implements the scheduling algorithm for SMS according to the
1439 above algorithm. */
static partial_schedule_ptr
sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order)
{
  int ii = mii;
  int i, c, success;
  int try_again_with_larger_ii = true;
  int num_nodes = g->num_nodes;
  ddg_edge_ptr e;
  int start, end, step; /* Place together into one struct?  */
  sbitmap sched_nodes = sbitmap_alloc (num_nodes);
  sbitmap must_precede = sbitmap_alloc (num_nodes);
  sbitmap must_follow = sbitmap_alloc (num_nodes);
  sbitmap tobe_scheduled = sbitmap_alloc (num_nodes);

  partial_schedule_ptr ps = create_partial_schedule (ii, g, DFA_HISTORY);

  sbitmap_ones (tobe_scheduled);
  sbitmap_zero (sched_nodes);

  /* Outer loop: retry the whole node list with a larger II whenever some
     node could not be placed, until everything schedulable is scheduled
     or II reaches MAXII.  */
  while ((! sbitmap_equal (tobe_scheduled, sched_nodes)
	 || try_again_with_larger_ii ) && ii < maxii)
    {
      int j;
      bool unscheduled_nodes = false;

      if (dump_file)
	fprintf (dump_file, "Starting with ii=%d\n", ii);
      if (try_again_with_larger_ii)
	{
	  /* II was bumped: discard partial progress and start over.  */
	  try_again_with_larger_ii = false;
	  sbitmap_zero (sched_nodes);
	}

      for (i = 0; i < num_nodes; i++)
	{
	  int u = nodes_order[i];
	  ddg_node_ptr u_node = &ps->g->nodes[u];
	  rtx insn = u_node->insn;

	  /* Notes etc. need no cycle of their own.  */
	  if (!INSN_P (insn))
	    {
	      RESET_BIT (tobe_scheduled, u);
	      continue;
	    }

	  if (JUMP_P (insn)) /* Closing branch handled later.  */
	    {
	      RESET_BIT (tobe_scheduled, u);
	      continue;
	    }

	  if (TEST_BIT (sched_nodes, u))
	    continue;

	  /* Try to get non-empty scheduling window.  On failure,
	     backtrack: unschedule earlier-ordered neighbors of U one at
	     a time (walking J back through the order) and retry.  */
	  j = i;
	  while (get_sched_window (ps, nodes_order, i, sched_nodes, ii, &start, &step, &end) < 0
		 && j > 0)
	    {
	      unscheduled_nodes = true;
	      if (TEST_BIT (NODE_PREDECESSORS (u_node), nodes_order[j - 1])
		  || TEST_BIT (NODE_SUCCESSORS (u_node), nodes_order[j - 1]))
		{
		  ps_unschedule_node (ps, &ps->g->nodes[nodes_order[j - 1]]);
		  RESET_BIT (sched_nodes, nodes_order [j - 1]);
		}
	      j--;
	    }
	  /* NOTE(review): the loop above exits with j >= 0 (condition is
	     j > 0), so this branch looks unreachable; failure with j == 0
	     instead falls through and is caught by the empty-window guard
	     before the placement loop — confirm before relying on it.  */
	  if (j < 0)
	    {
	      /* ??? Try backtracking instead of immediately ii++?  */
	      ii++;
	      try_again_with_larger_ii = true;
	      reset_partial_schedule (ps, ii);
	      break;
	    }
	  /* 2. Try scheduling u in window.  */
	  if (dump_file)
	    fprintf (dump_file,
		     "Trying to schedule node %d in (%d .. %d) step %d\n",
		     u, start, end, step);

	  /* use must_follow & must_precede bitmaps to determine order
	     of nodes within the cycle.  A zero-slack dependence
	     (latency == ii * distance) to a node already placed at the
	     window boundary forces an intra-cycle ordering.  */
	  sbitmap_zero (must_precede);
	  sbitmap_zero (must_follow);
	  for (e = u_node->in; e != 0; e = e->next_in)
	    if (TEST_BIT (sched_nodes, e->src->cuid)
		&& e->latency == (ii * e->distance)
		&& start == SCHED_TIME (e->src))
	      SET_BIT (must_precede, e->src->cuid);

	  for (e = u_node->out; e != 0; e = e->next_out)
	    if (TEST_BIT (sched_nodes, e->dest->cuid)
		&& e->latency == (ii * e->distance)
		&& end == SCHED_TIME (e->dest))
	      SET_BIT (must_follow, e->dest->cuid);

	  success = 0;
	  if ((step > 0 && start < end) || (step < 0 && start > end))
	    for (c = start; c != end; c += step)
	      {
		ps_insn_ptr psi;

		psi = ps_add_node_check_conflicts (ps, u_node, c,
						   must_precede,
						   must_follow);

		if (psi)
		  {
		    SCHED_TIME (u_node) = c;
		    SET_BIT (sched_nodes, u);
		    success = 1;
		    if (dump_file)
		      fprintf (dump_file, "Schedule in %d\n", c);
		    break;
		  }
	      }
	  if (!success)
	    {
	      /* ??? Try backtracking instead of immediately ii++?  */
	      ii++;
	      try_again_with_larger_ii = true;
	      reset_partial_schedule (ps, ii);
	      break;
	    }
	  /* Nodes were unscheduled during backtracking; restart the
	     node walk so they get rescheduled.  */
	  if (unscheduled_nodes)
	    break;

	  /* ??? If (success), check register pressure estimates.  */
	} /* Continue with next node.  */
    } /* While try_again_with_larger_ii.  */

  sbitmap_free (sched_nodes);
  sbitmap_free (must_precede);
  sbitmap_free (must_follow);
  sbitmap_free (tobe_scheduled);

  /* Failure to fit within MAXII: report by returning NULL.  */
  if (ii >= maxii)
    {
      free_partial_schedule (ps);
      ps = NULL;
    }
  return ps;
}
1587 /* This page implements the algorithm for ordering the nodes of a DDG
1588 for modulo scheduling, activated through the
1589 "int sms_order_nodes (ddg_ptr, int mii, int * result)" API. */
/* Accessors for the per-node ordering parameters, stashed in each DDG
   node's AUX pointer for the duration of the ordering phase.  */
#define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
#define ASAP(x) (ORDER_PARAMS ((x))->asap)
#define ALAP(x) (ORDER_PARAMS ((x))->alap)
#define HEIGHT(x) (ORDER_PARAMS ((x))->height)
/* Mobility: slack between the earliest and latest feasible cycle.  */
#define MOB(x) (ALAP ((x)) - ASAP ((x)))
/* Depth coincides with ASAP in this formulation.  */
#define DEPTH(x) (ASAP ((x)))

typedef struct node_order_params * nopa;

static void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result);
static int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int*, int);
static nopa calculate_order_params (ddg_ptr, int mii);
static int find_max_asap (ddg_ptr, sbitmap);
static int find_max_hv_min_mob (ddg_ptr, sbitmap);
static int find_max_dv_min_mob (ddg_ptr, sbitmap);

/* Direction in which order_nodes_in_scc currently grows the order.  */
enum sms_direction {BOTTOMUP, TOPDOWN};

/* Per-node ordering parameters; see the accessor macros above.  */
struct node_order_params
{
  int asap;   /* Earliest feasible schedule cycle (ignoring back-arcs).  */
  int alap;   /* Latest feasible schedule cycle.  */
  int height; /* Longest latency path to any zero-distance successor.  */
};
1616 /* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1. */
1617 static void
1618 check_nodes_order (int *node_order, int num_nodes)
1620 int i;
1621 sbitmap tmp = sbitmap_alloc (num_nodes);
1623 sbitmap_zero (tmp);
1625 for (i = 0; i < num_nodes; i++)
1627 int u = node_order[i];
1629 gcc_assert (u < num_nodes && u >= 0 && !TEST_BIT (tmp, u));
1631 SET_BIT (tmp, u);
1634 sbitmap_free (tmp);
1637 /* Order the nodes of G for scheduling and pass the result in
1638 NODE_ORDER. Also set aux.count of each node to ASAP.
1639 Return the recMII for the given DDG. */
1640 static int
1641 sms_order_nodes (ddg_ptr g, int mii, int * node_order)
1643 int i;
1644 int rec_mii = 0;
1645 ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g);
1647 nopa nops = calculate_order_params (g, mii);
1649 order_nodes_of_sccs (sccs, node_order);
1651 if (sccs->num_sccs > 0)
1652 /* First SCC has the largest recurrence_length. */
1653 rec_mii = sccs->sccs[0]->recurrence_length;
1655 /* Save ASAP before destroying node_order_params. */
1656 for (i = 0; i < g->num_nodes; i++)
1658 ddg_node_ptr v = &g->nodes[i];
1659 v->aux.count = ASAP (v);
1662 free (nops);
1663 free_ddg_all_sccs (sccs);
1664 check_nodes_order (node_order, g->num_nodes);
1666 return rec_mii;
/* Compute a full node order for ALL_SCCS's DDG into NODE_ORDER, one
   (extended) SCC at a time, highest recMII first.  */
static void
order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int * node_order)
{
  int i, pos = 0;
  ddg_ptr g = all_sccs->ddg;
  int num_nodes = g->num_nodes;
  sbitmap prev_sccs = sbitmap_alloc (num_nodes);
  sbitmap on_path = sbitmap_alloc (num_nodes);
  sbitmap tmp = sbitmap_alloc (num_nodes);
  sbitmap ones = sbitmap_alloc (num_nodes);

  sbitmap_zero (prev_sccs);
  sbitmap_ones (ones);

  /* Perform the node ordering starting from the SCC with the highest recMII.
     For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc.  */
  for (i = 0; i < all_sccs->num_sccs; i++)
    {
      ddg_scc_ptr scc = all_sccs->sccs[i];

      /* Add nodes on paths from previous SCCs to the current SCC.  */
      find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes);
      sbitmap_a_or_b (tmp, scc->nodes, on_path);

      /* Add nodes on paths from the current SCC to previous SCCs.  */
      find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs);
      sbitmap_a_or_b (tmp, tmp, on_path);

      /* Remove nodes of previous SCCs from current extended SCC.  */
      sbitmap_difference (tmp, tmp, prev_sccs);

      pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
      /* Above call to order_nodes_in_scc updated prev_sccs |= tmp.  */
    }

  /* Handle the remaining nodes that do not belong to any scc.  Each call
     to order_nodes_in_scc handles a single connected component.  */
  while (pos < g->num_nodes)
    {
      sbitmap_difference (tmp, ones, prev_sccs);
      pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
    }

  sbitmap_free (prev_sccs);
  sbitmap_free (on_path);
  sbitmap_free (tmp);
  sbitmap_free (ones);
}
1717 /* MII is needed if we consider backarcs (that do not close recursive cycles). */
static struct node_order_params *
calculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED)
{
  int u;
  int max_asap;
  int num_nodes = g->num_nodes;
  ddg_edge_ptr e;
  /* Allocate a place to hold ordering params for each node in the DDG.  */
  nopa node_order_params_arr;

  /* Initialize of ASAP/ALAP/HEIGHT to zero.  */
  node_order_params_arr = (nopa) xcalloc (num_nodes,
					  sizeof (struct node_order_params));

  /* Set the aux pointer of each node to point to its order_params structure.  */
  for (u = 0; u < num_nodes; u++)
    g->nodes[u].aux.info = &node_order_params_arr[u];

  /* Disregarding a backarc from each recursive cycle to obtain a DAG,
     calculate ASAP, ALAP, mobility, distance, and height for each node
     in the dependence (direct acyclic) graph.  Edges with distance > 0
     (the back-arcs) are skipped below for exactly this reason.  */

  /* We assume that the nodes in the array are in topological order.  */

  /* Forward pass over the topological order: ASAP of a node is the max
     over its zero-distance predecessors of their ASAP plus latency.  */
  max_asap = 0;
  for (u = 0; u < num_nodes; u++)
    {
      ddg_node_ptr u_node = &g->nodes[u];

      ASAP (u_node) = 0;
      for (e = u_node->in; e; e = e->next_in)
	if (e->distance == 0)
	  ASAP (u_node) = MAX (ASAP (u_node),
			       ASAP (e->src) + e->latency);
      max_asap = MAX (max_asap, ASAP (u_node));
    }

  /* Backward pass: ALAP shrinks from MAX_ASAP toward the node's latest
     feasible cycle; HEIGHT grows with the longest downward path.  */
  for (u = num_nodes - 1; u > -1; u--)
    {
      ddg_node_ptr u_node = &g->nodes[u];

      ALAP (u_node) = max_asap;
      HEIGHT (u_node) = 0;
      for (e = u_node->out; e; e = e->next_out)
	if (e->distance == 0)
	  {
	    ALAP (u_node) = MIN (ALAP (u_node),
				 ALAP (e->dest) - e->latency);
	    HEIGHT (u_node) = MAX (HEIGHT (u_node),
				   HEIGHT (e->dest) + e->latency);
	  }
    }

  return node_order_params_arr;
}
1774 static int
1775 find_max_asap (ddg_ptr g, sbitmap nodes)
1777 unsigned int u = 0;
1778 int max_asap = -1;
1779 int result = -1;
1780 sbitmap_iterator sbi;
1782 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
1784 ddg_node_ptr u_node = &g->nodes[u];
1786 if (max_asap < ASAP (u_node))
1788 max_asap = ASAP (u_node);
1789 result = u;
1792 return result;
1795 static int
1796 find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
1798 unsigned int u = 0;
1799 int max_hv = -1;
1800 int min_mob = INT_MAX;
1801 int result = -1;
1802 sbitmap_iterator sbi;
1804 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
1806 ddg_node_ptr u_node = &g->nodes[u];
1808 if (max_hv < HEIGHT (u_node))
1810 max_hv = HEIGHT (u_node);
1811 min_mob = MOB (u_node);
1812 result = u;
1814 else if ((max_hv == HEIGHT (u_node))
1815 && (min_mob > MOB (u_node)))
1817 min_mob = MOB (u_node);
1818 result = u;
1821 return result;
1824 static int
1825 find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
1827 unsigned int u = 0;
1828 int max_dv = -1;
1829 int min_mob = INT_MAX;
1830 int result = -1;
1831 sbitmap_iterator sbi;
1833 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
1835 ddg_node_ptr u_node = &g->nodes[u];
1837 if (max_dv < DEPTH (u_node))
1839 max_dv = DEPTH (u_node);
1840 min_mob = MOB (u_node);
1841 result = u;
1843 else if ((max_dv == DEPTH (u_node))
1844 && (min_mob > MOB (u_node)))
1846 min_mob = MOB (u_node);
1847 result = u;
1850 return result;
1853 /* Places the nodes of SCC into the NODE_ORDER array starting
1854 at position POS, according to the SMS ordering algorithm.
1855 NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
1856 the NODE_ORDER array, starting from position zero. */
static int
order_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc,
		    int * node_order, int pos)
{
  enum sms_direction dir;
  int num_nodes = g->num_nodes;
  sbitmap workset = sbitmap_alloc (num_nodes);
  sbitmap tmp = sbitmap_alloc (num_nodes);
  sbitmap zero_bitmap = sbitmap_alloc (num_nodes);
  sbitmap predecessors = sbitmap_alloc (num_nodes);
  sbitmap successors = sbitmap_alloc (num_nodes);

  sbitmap_zero (predecessors);
  find_predecessors (predecessors, g, nodes_ordered);

  sbitmap_zero (successors);
  find_successors (successors, g, nodes_ordered);

  /* Pick the initial workset and sweep direction: prefer nodes of SCC
     adjacent to already-ordered nodes; otherwise seed with the node of
     maximal ASAP.  */
  sbitmap_zero (tmp);
  if (sbitmap_a_and_b_cg (tmp, predecessors, scc))
    {
      sbitmap_copy (workset, tmp);
      dir = BOTTOMUP;
    }
  else if (sbitmap_a_and_b_cg (tmp, successors, scc))
    {
      sbitmap_copy (workset, tmp);
      dir = TOPDOWN;
    }
  else
    {
      int u;

      sbitmap_zero (workset);
      if ((u = find_max_asap (g, scc)) >= 0)
	SET_BIT (workset, u);
      dir = BOTTOMUP;
    }

  /* Alternate TOPDOWN/BOTTOMUP sweeps, draining the workset each time,
     until every node of the (extended) SCC has been placed.  */
  sbitmap_zero (zero_bitmap);
  while (!sbitmap_equal (workset, zero_bitmap))
    {
      int v;
      ddg_node_ptr v_node;
      sbitmap v_node_preds;
      sbitmap v_node_succs;

      if (dir == TOPDOWN)
	{
	  while (!sbitmap_equal (workset, zero_bitmap))
	    {
	      /* Greedy pick: maximal height, then minimal mobility.  */
	      v = find_max_hv_min_mob (g, workset);
	      v_node = &g->nodes[v];
	      node_order[pos++] = v;
	      v_node_succs = NODE_SUCCESSORS (v_node);
	      sbitmap_a_and_b (tmp, v_node_succs, scc);

	      /* Don't consider the already ordered successors again.  */
	      sbitmap_difference (tmp, tmp, nodes_ordered);
	      sbitmap_a_or_b (workset, workset, tmp);
	      RESET_BIT (workset, v);
	      SET_BIT (nodes_ordered, v);
	    }
	  /* Flip direction; refill workset with unordered predecessors
	     of everything ordered so far, restricted to SCC.  */
	  dir = BOTTOMUP;
	  sbitmap_zero (predecessors);
	  find_predecessors (predecessors, g, nodes_ordered);
	  sbitmap_a_and_b (workset, predecessors, scc);
	}
      else
	{
	  while (!sbitmap_equal (workset, zero_bitmap))
	    {
	      /* Greedy pick: maximal depth, then minimal mobility.  */
	      v = find_max_dv_min_mob (g, workset);
	      v_node = &g->nodes[v];
	      node_order[pos++] = v;
	      v_node_preds = NODE_PREDECESSORS (v_node);
	      sbitmap_a_and_b (tmp, v_node_preds, scc);

	      /* Don't consider the already ordered predecessors again.  */
	      sbitmap_difference (tmp, tmp, nodes_ordered);
	      sbitmap_a_or_b (workset, workset, tmp);
	      RESET_BIT (workset, v);
	      SET_BIT (nodes_ordered, v);
	    }
	  dir = TOPDOWN;
	  sbitmap_zero (successors);
	  find_successors (successors, g, nodes_ordered);
	  sbitmap_a_and_b (workset, successors, scc);
	}
    }
  sbitmap_free (tmp);
  sbitmap_free (workset);
  sbitmap_free (zero_bitmap);
  sbitmap_free (predecessors);
  sbitmap_free (successors);
  return pos;
}
1956 /* This page contains functions for manipulating partial-schedules during
1957 modulo scheduling. */
1959 /* Create a partial schedule and allocate a memory to hold II rows. */
1961 static partial_schedule_ptr
1962 create_partial_schedule (int ii, ddg_ptr g, int history)
1964 partial_schedule_ptr ps = XNEW (struct partial_schedule);
1965 ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
1966 ps->ii = ii;
1967 ps->history = history;
1968 ps->min_cycle = INT_MAX;
1969 ps->max_cycle = INT_MIN;
1970 ps->g = g;
1972 return ps;
1975 /* Free the PS_INSNs in rows array of the given partial schedule.
1976 ??? Consider caching the PS_INSN's. */
1977 static void
1978 free_ps_insns (partial_schedule_ptr ps)
1980 int i;
1982 for (i = 0; i < ps->ii; i++)
1984 while (ps->rows[i])
1986 ps_insn_ptr ps_insn = ps->rows[i]->next_in_row;
1988 free (ps->rows[i]);
1989 ps->rows[i] = ps_insn;
1991 ps->rows[i] = NULL;
1995 /* Free all the memory allocated to the partial schedule. */
1997 static void
1998 free_partial_schedule (partial_schedule_ptr ps)
2000 if (!ps)
2001 return;
2002 free_ps_insns (ps);
2003 free (ps->rows);
2004 free (ps);
2007 /* Clear the rows array with its PS_INSNs, and create a new one with
2008 NEW_II rows. */
static void
reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
{
  if (!ps)
    return;
  free_ps_insns (ps);
  /* free_ps_insns already NULLed every row, so when the II is unchanged
     the rows array can be reused as-is.  Note min/max_cycle are NOT
     reset in this case.  */
  if (new_ii == ps->ii)
    return;
  ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii
						 * sizeof (ps_insn_ptr));
  memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr));
  ps->ii = new_ii;
  ps->min_cycle = INT_MAX;
  ps->max_cycle = INT_MIN;
}
2026 /* Prints the partial schedule as an ii rows array, for each rows
2027 print the ids of the insns in it. */
2028 void
2029 print_partial_schedule (partial_schedule_ptr ps, FILE *dump)
2031 int i;
2033 for (i = 0; i < ps->ii; i++)
2035 ps_insn_ptr ps_i = ps->rows[i];
2037 fprintf (dump, "\n[CYCLE %d ]: ", i);
2038 while (ps_i)
2040 fprintf (dump, "%d, ",
2041 INSN_UID (ps_i->node->insn));
2042 ps_i = ps_i->next_in_row;
2047 /* Creates an object of PS_INSN and initializes it to the given parameters. */
2048 static ps_insn_ptr
2049 create_ps_insn (ddg_node_ptr node, int rest_count, int cycle)
2051 ps_insn_ptr ps_i = XNEW (struct ps_insn);
2053 ps_i->node = node;
2054 ps_i->next_in_row = NULL;
2055 ps_i->prev_in_row = NULL;
2056 ps_i->row_rest_count = rest_count;
2057 ps_i->cycle = cycle;
2059 return ps_i;
2063 /* Removes the given PS_INSN from the partial schedule. Returns false if the
2064 node is not found in the partial schedule, else returns true. */
static bool
remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
{
  int row;

  if (!ps || !ps_i)
    return false;

  row = SMODULO (ps_i->cycle, ps->ii);
  if (! ps_i->prev_in_row)
    {
      /* PS_I claims to be first in its row; verify against the row
	 head before unlinking.  */
      if (ps_i != ps->rows[row])
	return false;

      ps->rows[row] = ps_i->next_in_row;
      if (ps->rows[row])
	ps->rows[row]->prev_in_row = NULL;
    }
  else
    {
      /* Unlink from the middle/end of the doubly linked row list.  */
      ps_i->prev_in_row->next_in_row = ps_i->next_in_row;
      if (ps_i->next_in_row)
	ps_i->next_in_row->prev_in_row = ps_i->prev_in_row;
    }
  /* Ownership: the partial schedule owns its PS_INSNs; release here.  */
  free (ps_i);
  return true;
}
2093 /* Unlike what literature describes for modulo scheduling (which focuses
2094 on VLIW machines) the order of the instructions inside a cycle is
2095 important. Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know
2096 where the current instruction should go relative to the already
2097 scheduled instructions in the given cycle. Go over these
2098 instructions and find the first possible column to put it in. */
static bool
ps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
		     sbitmap must_precede, sbitmap must_follow)
{
  ps_insn_ptr next_ps_i;
  ps_insn_ptr first_must_follow = NULL;
  ps_insn_ptr last_must_precede = NULL;
  int row;

  if (! ps_i)
    return false;

  row = SMODULO (ps_i->cycle, ps->ii);

  /* Find the first must follow and the last must precede
     and insert the node immediately after the must precede
     but make sure that it there is no must follow after it.  */
  for (next_ps_i = ps->rows[row];
       next_ps_i;
       next_ps_i = next_ps_i->next_in_row)
    {
      if (TEST_BIT (must_follow, next_ps_i->node->cuid)
	  && ! first_must_follow)
	first_must_follow = next_ps_i;
      if (TEST_BIT (must_precede, next_ps_i->node->cuid))
	{
	  /* If we have already met a node that must follow, then
	     there is no possible column.  */
	  if (first_must_follow)
	    return false;
	  else
	    last_must_precede = next_ps_i;
	}
    }

  /* Now insert the node after INSERT_AFTER_PSI.  */

  if (! last_must_precede)
    {
      /* No must-precede constraint: prepend at the head of the row.  */
      ps_i->next_in_row = ps->rows[row];
      ps_i->prev_in_row = NULL;
      if (ps_i->next_in_row)
	ps_i->next_in_row->prev_in_row = ps_i;
      ps->rows[row] = ps_i;
    }
  else
    {
      /* Splice PS_I right after the last must-precede insn.  */
      ps_i->next_in_row = last_must_precede->next_in_row;
      last_must_precede->next_in_row = ps_i;
      ps_i->prev_in_row = last_must_precede;
      if (ps_i->next_in_row)
	ps_i->next_in_row->prev_in_row = ps_i;
    }

  return true;
}
2156 /* Advances the PS_INSN one column in its current row; returns false
2157 in failure and true in success. Bit N is set in MUST_FOLLOW if
2158 the node with cuid N must be come after the node pointed to by
2159 PS_I when scheduled in the same cycle. */
static int
ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
			sbitmap must_follow)
{
  ps_insn_ptr prev, next;
  int row;
  ddg_node_ptr next_node;

  if (!ps || !ps_i)
    return false;

  row = SMODULO (ps_i->cycle, ps->ii);

  /* Already last in its row; nowhere to advance to.  */
  if (! ps_i->next_in_row)
    return false;

  next_node = ps_i->next_in_row->node;

  /* Check if next_in_row is dependent on ps_i, both having same sched
     times (typically ANTI_DEP).  If so, ps_i cannot skip over it.  */
  if (TEST_BIT (must_follow, next_node->cuid))
    return false;

  /* Advance PS_I over its next_in_row in the doubly linked list.
     Effectively swaps PS_I with NEXT while fixing up the four
     surrounding links and the row head.  */
  prev = ps_i->prev_in_row;
  next = ps_i->next_in_row;

  if (ps_i == ps->rows[row])
    ps->rows[row] = next;

  ps_i->next_in_row = next->next_in_row;

  if (next->next_in_row)
    next->next_in_row->prev_in_row = ps_i;

  next->next_in_row = ps_i;
  ps_i->prev_in_row = next;

  next->prev_in_row = prev;
  if (prev)
    prev->next_in_row = next;

  return true;
}
2205 /* Inserts a DDG_NODE to the given partial schedule at the given cycle.
2206 Returns 0 if this is not possible and a PS_INSN otherwise. Bit N is
2207 set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must be come
2208 before/after (respectively) the node pointed to by PS_I when scheduled
2209 in the same cycle. */
2210 static ps_insn_ptr
2211 add_node_to_ps (partial_schedule_ptr ps, ddg_node_ptr node, int cycle,
2212 sbitmap must_precede, sbitmap must_follow)
2214 ps_insn_ptr ps_i;
2215 int rest_count = 1;
2216 int row = SMODULO (cycle, ps->ii);
2218 if (ps->rows[row]
2219 && ps->rows[row]->row_rest_count >= issue_rate)
2220 return NULL;
2222 if (ps->rows[row])
2223 rest_count += ps->rows[row]->row_rest_count;
2225 ps_i = create_ps_insn (node, rest_count, cycle);
2227 /* Finds and inserts PS_I according to MUST_FOLLOW and
2228 MUST_PRECEDE. */
2229 if (! ps_insn_find_column (ps, ps_i, must_precede, must_follow))
2231 free (ps_i);
2232 return NULL;
2235 return ps_i;
2238 /* Advance time one cycle. Assumes DFA is being used. */
2239 static void
2240 advance_one_cycle (void)
2242 if (targetm.sched.dfa_pre_cycle_insn)
2243 state_transition (curr_state,
2244 targetm.sched.dfa_pre_cycle_insn ());
2246 state_transition (curr_state, NULL);
2248 if (targetm.sched.dfa_post_cycle_insn)
2249 state_transition (curr_state,
2250 targetm.sched.dfa_post_cycle_insn ());
2253 /* Given the kernel of a loop (from FIRST_INSN to LAST_INSN), finds
2254 the number of cycles according to DFA that the kernel fits in,
2255 we use this to check if we done well with SMS after we add
2256 register moves. In some cases register moves overhead makes
2257 it even worse than the original loop. We want SMS to be performed
2258 when it gives less cycles after register moves are added. */
2259 static int
2260 kernel_number_of_cycles (rtx first_insn, rtx last_insn)
2262 int cycles = 0;
2263 rtx insn;
2264 int can_issue_more = issue_rate;
2266 state_reset (curr_state);
2268 for (insn = first_insn;
2269 insn != NULL_RTX && insn != last_insn;
2270 insn = NEXT_INSN (insn))
2272 if (! INSN_P (insn) || GET_CODE (PATTERN (insn)) == USE)
2273 continue;
2275 /* Check if there is room for the current insn. */
2276 if (!can_issue_more || state_dead_lock_p (curr_state))
2278 cycles ++;
2279 advance_one_cycle ();
2280 can_issue_more = issue_rate;
2283 /* Update the DFA state and return with failure if the DFA found
2284 recource conflicts. */
2285 if (state_transition (curr_state, insn) >= 0)
2287 cycles ++;
2288 advance_one_cycle ();
2289 can_issue_more = issue_rate;
2292 if (targetm.sched.variable_issue)
2293 can_issue_more =
2294 targetm.sched.variable_issue (sched_dump, sched_verbose,
2295 insn, can_issue_more);
2296 /* A naked CLOBBER or USE generates no instruction, so don't
2297 let them consume issue slots. */
2298 else if (GET_CODE (PATTERN (insn)) != USE
2299 && GET_CODE (PATTERN (insn)) != CLOBBER)
2300 can_issue_more--;
2302 return cycles;
2305 /* Checks if PS has resource conflicts according to DFA, starting from
2306 FROM cycle to TO cycle; returns true if there are conflicts and false
2307 if there are no conflicts. Assumes DFA is being used. */
2308 static int
2309 ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
2311 int cycle;
2313 state_reset (curr_state);
2315 for (cycle = from; cycle <= to; cycle++)
2317 ps_insn_ptr crr_insn;
2318 /* Holds the remaining issue slots in the current row. */
2319 int can_issue_more = issue_rate;
2321 /* Walk through the DFA for the current row. */
2322 for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)];
2323 crr_insn;
2324 crr_insn = crr_insn->next_in_row)
2326 rtx insn = crr_insn->node->insn;
2328 if (!INSN_P (insn))
2329 continue;
2331 /* Check if there is room for the current insn. */
2332 if (!can_issue_more || state_dead_lock_p (curr_state))
2333 return true;
2335 /* Update the DFA state and return with failure if the DFA found
2336 recource conflicts. */
2337 if (state_transition (curr_state, insn) >= 0)
2338 return true;
2340 if (targetm.sched.variable_issue)
2341 can_issue_more =
2342 targetm.sched.variable_issue (sched_dump, sched_verbose,
2343 insn, can_issue_more);
2344 /* A naked CLOBBER or USE generates no instruction, so don't
2345 let them consume issue slots. */
2346 else if (GET_CODE (PATTERN (insn)) != USE
2347 && GET_CODE (PATTERN (insn)) != CLOBBER)
2348 can_issue_more--;
2351 /* Advance the DFA to the next cycle. */
2352 advance_one_cycle ();
2354 return false;
2357 /* Checks if the given node causes resource conflicts when added to PS at
2358 cycle C. If not the node is added to PS and returned; otherwise zero
2359 is returned. Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
2360 cuid N must be come before/after (respectively) the node pointed to by
2361 PS_I when scheduled in the same cycle. */
2362 ps_insn_ptr
2363 ps_add_node_check_conflicts (partial_schedule_ptr ps, ddg_node_ptr n,
2364 int c, sbitmap must_precede,
2365 sbitmap must_follow)
2367 int has_conflicts = 0;
2368 ps_insn_ptr ps_i;
2370 /* First add the node to the PS, if this succeeds check for
2371 conflicts, trying different issue slots in the same row. */
2372 if (! (ps_i = add_node_to_ps (ps, n, c, must_precede, must_follow)))
2373 return NULL; /* Failed to insert the node at the given cycle. */
2375 has_conflicts = ps_has_conflicts (ps, c, c)
2376 || (ps->history > 0
2377 && ps_has_conflicts (ps,
2378 c - ps->history,
2379 c + ps->history));
2381 /* Try different issue slots to find one that the given node can be
2382 scheduled in without conflicts. */
2383 while (has_conflicts)
2385 if (! ps_insn_advance_column (ps, ps_i, must_follow))
2386 break;
2387 has_conflicts = ps_has_conflicts (ps, c, c)
2388 || (ps->history > 0
2389 && ps_has_conflicts (ps,
2390 c - ps->history,
2391 c + ps->history));
2394 if (has_conflicts)
2396 remove_node_from_ps (ps, ps_i);
2397 return NULL;
2400 ps->min_cycle = MIN (ps->min_cycle, c);
2401 ps->max_cycle = MAX (ps->max_cycle, c);
2402 return ps_i;
2405 /* Rotate the rows of PS such that insns scheduled at time
2406 START_CYCLE will appear in row 0. Updates max/min_cycles. */
2407 void
2408 rotate_partial_schedule (partial_schedule_ptr ps, int start_cycle)
2410 int i, row, backward_rotates;
2411 int last_row = ps->ii - 1;
2413 if (start_cycle == 0)
2414 return;
2416 backward_rotates = SMODULO (start_cycle, ps->ii);
2418 /* Revisit later and optimize this into a single loop. */
2419 for (i = 0; i < backward_rotates; i++)
2421 ps_insn_ptr first_row = ps->rows[0];
2423 for (row = 0; row < last_row; row++)
2424 ps->rows[row] = ps->rows[row+1];
2426 ps->rows[last_row] = first_row;
2429 ps->max_cycle -= start_cycle;
2430 ps->min_cycle -= start_cycle;
2433 /* Remove the node N from the partial schedule PS; because we restart the DFA
2434 each time we want to check for resource conflicts; this is equivalent to
2435 unscheduling the node N. */
2436 static bool
2437 ps_unschedule_node (partial_schedule_ptr ps, ddg_node_ptr n)
2439 ps_insn_ptr ps_i;
2440 int row = SMODULO (SCHED_TIME (n), ps->ii);
2442 if (row < 0 || row > ps->ii)
2443 return false;
2445 for (ps_i = ps->rows[row];
2446 ps_i && ps_i->node != n;
2447 ps_i = ps_i->next_in_row);
2448 if (!ps_i)
2449 return false;
2451 return remove_node_from_ps (ps, ps_i);
2453 #endif /* INSN_SCHEDULING */
2455 static bool
2456 gate_handle_sms (void)
2458 return (optimize > 0 && flag_modulo_sched);
/* Run instruction scheduler.  */
/* Perform SMS module scheduling.  */
static unsigned int
rest_of_handle_sms (void)
{
#ifdef INSN_SCHEDULING
  basic_block bb;

  /* We want to be able to create new pseudos.  */
  no_new_pseudos = 0;
  /* Collect loop information to be used in SMS.  */
  cfg_layout_initialize (0);
  sms_schedule ();

  /* Update the life information, because we add pseudos.  */
  max_regno = max_reg_num ();
  no_new_pseudos = 1;

  /* Chain the blocks in layout order via bb->aux, then finalize the
     layout changes.  */
  FOR_EACH_BB (bb)
    {
      if (bb->next_bb != EXIT_BLOCK_PTR)
	bb->aux = bb->next_bb;
    }
  cfg_layout_finalize ();
  free_dominance_info (CDI_DOMINATORS);
#endif /* INSN_SCHEDULING */
  return 0;
}
2490 struct tree_opt_pass pass_sms =
2492 "sms", /* name */
2493 gate_handle_sms, /* gate */
2494 rest_of_handle_sms, /* execute */
2495 NULL, /* sub */
2496 NULL, /* next */
2497 0, /* static_pass_number */
2498 TV_SMS, /* tv_id */
2499 0, /* properties_required */
2500 0, /* properties_provided */
2501 0, /* properties_destroyed */
2502 TODO_dump_func, /* todo_flags_start */
2503 TODO_df_finish |
2504 TODO_dump_func |
2505 TODO_ggc_collect, /* todo_flags_finish */
2506 'm' /* letter */