CORE: get perf_delay_slot_bubble out of the critical path
[yari.git] / rtl / yari-core / stage_D.v
blobc204b4fd0af6b62ace2d62fd88092bf1086fc32c
1 // -----------------------------------------------------------------------
2 //
3 // Copyright 2004,2007 Tommy Thorn - All Rights Reserved
4 //
5 // This program is free software; you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation, Inc., 53 Temple Place Ste 330,
8 // Boston MA 02111-1307, USA; either version 2 of the License, or
9 // (at your option) any later version; incorporated herein by reference.
11 // -----------------------------------------------------------------------
13 `timescale 1ns/10ps
14 `include "asm.v"
17 The decoding stage is relatively simple. It takes an instruction
18 and summarizes its different aspects. The most important task in
19 this stage is to fetch the values of the operands from the register
20 file and to forward them from later stages if needed.
// -----------------------------------------------------------------------
// stage_D -- instruction decode stage.
//
// Every clock this stage registers the incoming instruction (i_*) into
// the d_* outputs, splitting it into its fields and deriving:
//   - d_wbr            write-back register (0 == no write-back, valid
//                      registers are remapped to 32..63)
//   - d_target         branch/jump target
//   - d_has_delay_slot set for branches and jumps
//   - d_illegal_instr  set for instructions this core doesn't support
//   - d_simm           sign-extended 16-bit immediate
// Operand values are read from a duplicated register file and bypassed
// from the X, M and (local) W stages, in that priority order.
//
// The stage also requests a restart (d_restart/d_restart_pc/d_flush_X)
// when a branch or jump in D is followed by a bubble instead of its
// delay-slot instruction, and maintains two performance counters.
// -----------------------------------------------------------------------
module stage_D(input  wire        clock

              ,input  wire        i_valid          // 0 => ignore i_instr.
              ,input  wire [31:0] i_instr          // Current instr.
              ,input  wire [31:0] i_pc             // Addr of current instr
              ,input  wire [31:0] i_npc            // Addr of next instr

               // Forwarding
              ,input  wire        x_valid
              ,input  wire [ 5:0] x_wbr
              ,input  wire [31:0] x_res

              ,input  wire [31:0] m_pc
              ,input  wire        m_valid
              ,input  wire [ 5:0] m_wbr
              ,input  wire [31:0] m_res

              ,output reg         d_valid         = 0
              ,output reg         d_illegal_instr = 0 // XXX Must trap on this
              ,output reg  [31:0] d_pc            = 0
              ,output reg  [31:0] d_instr         = 0
              ,output reg  [31:0] d_npc           = 0
              ,output reg  [ 5:0] d_opcode        = 0
              ,output reg  [ 5:0] d_fn            = 0
              ,output reg  [ 4:0] d_rd            = 0
              ,output reg  [ 5:0] d_rs            = 0
              ,output reg  [ 5:0] d_rt            = 0
              ,output reg  [ 4:0] d_sa            = 0
              ,output reg  [31:0] d_target        = 0
              ,output reg  [ 5:0] d_wbr           = 0
              ,output reg         d_has_delay_slot= 0

              ,output wire [31:0] d_op1_val        // aka d_rs_val
              ,output wire [31:0] d_op2_val
              ,output wire [31:0] d_rt_val         // For stores
              ,output reg  [31:0] d_simm          = 0

              ,output reg         d_restart       = 0
              ,output reg  [31:0] d_restart_pc    = 0
              ,output reg         d_flush_X       = 0

              ,input  wire        flush_D
              ,output reg  [31:0] perf_delay_slot_bubble = 0
              ,output reg  [47:0] perf_retired_inst      = 0
              );

   parameter debug = 0;

   // Name various instruction fields
   wire [ 5:0] i_opcode;
   wire [ 5:0] i_rs, i_rt;
   wire [ 4:0] i__rs, i__rt, i_rd;
   wire [ 4:0] i_sa;
   wire [ 5:0] i_fn;

   assign {i_opcode,i__rs,i__rt,i_rd,i_sa,i_fn} = i_instr;
   assign i_rs = {1'b1,i__rs}; // Bit 5 means valid.
   assign i_rt = {1'b1,i__rt};

   wire [25:0] i_offset = i_instr[25:0];

   // Sign-extend immediate field
   wire [31:0] i_simm = {{16{i_instr[15]}}, i_instr[15:0]};

   // Branch target: next pc plus the sign-extended immediate times four.
   // Jump target: top four bits of next pc, then the 26-bit offset times four.
   wire [31:0] i_branch_target = i_npc + {i_simm[29:0], 2'h0};
   wire [31:0] i_jump_target   = {i_npc[31:28],i_offset,2'h0};

   always @(posedge clock) d_simm <= i_simm;

   // Registered select for d_op2_val: immediate rather than rt value.
   reg         d_op2_is_imm = 0;
   always @(posedge clock)
      d_op2_is_imm <= (i_opcode[5:3] == 1 || /* Immediate instructions */
                       i_opcode[5:3] == 4 || /* Loads */
                       i_opcode[5:3] == 5);  /* Stores */

   // Register file.  Two copies with identical write ports (both are
   // written with m_res), so that rs and rt can each be read from their
   // own memory in the same cycle.
   reg [31:0]  regs_A [31:0]; // Initialization is handled below
   reg [31:0]  regs_B [31:0]; // Initialization is handled below

   reg [31:0]  rs_reg_val, rt_reg_val;

   always @(posedge clock) begin
      if (m_valid & m_wbr[5])
         regs_A[m_wbr[4:0]] <= m_res;
      rs_reg_val <= regs_A[i_rs[4:0]];
   end

   always @(posedge clock) begin
      if (m_valid & m_wbr[5])
         regs_B[m_wbr[4:0]] <= m_res;
      rt_reg_val <= regs_B[i_rt[4:0]];
   end

   // Stage WB is only present here for bypass.  It is simply the M
   // stage result delayed by one clock.
   reg         w_valid = 0;
   reg [31:0]  w_res   = 0;
   reg [ 5:0]  w_wbr   = 0;
   always @(posedge clock) begin
      w_valid <= m_valid;
      w_wbr   <= m_wbr;
      w_res   <= m_res;
   end

   // Bypass detection.  All six bits are compared; d_rs/d_rt always have
   // bit 5 set, so a write-back register of 0 ("none") never matches.
   wire        d_forward_x_to_s = x_valid && d_rs == x_wbr;
   wire        d_forward_x_to_t = x_valid && d_rt == x_wbr;
   wire        d_forward_m_to_s = m_valid && d_rs == m_wbr;
   wire        d_forward_m_to_t = m_valid && d_rt == m_wbr;
   wire        d_forward_w_to_s = w_valid && d_rs == w_wbr;
   wire        d_forward_w_to_t = w_valid && d_rt == w_wbr;

   // The youngest producer wins: X, then M, then W, then the register file.
   assign      d_op1_val = (d_forward_x_to_s ? x_res :
                            d_forward_m_to_s ? m_res :
                            d_forward_w_to_s ? w_res :
                            rs_reg_val);
   assign      d_rt_val  = (d_forward_x_to_t ? x_res :
                            d_forward_m_to_t ? m_res :
                            d_forward_w_to_t ? w_res :
                            rt_reg_val);

   // XXX PERF This is immensely stupid; all opcodes know whether they
   // want the immediate or the register themselves, so reduce the cost
   // of d_op2_val by not letting it cover d_simm and adjust all users
   // of d_op2_val.  The only drawback is slightly less sharing for a few
   // instructions.  (And while there rename op1 -> rs, op2 -> rt).
   assign      d_op2_val = d_op2_is_imm ? d_simm : d_rt_val;

   always @(posedge clock) begin
      d_valid   <= i_valid;
      d_pc      <= i_pc;
      d_npc     <= i_npc;
      d_instr   <= i_instr; // XXX Just for debugging
      {d_opcode,d_rs,d_rt,d_rd,d_sa,d_fn} <= {i_opcode,i_rs,i_rt,i_rd,i_sa,i_fn};
      // Defaults; overridden further down when a delay-slot bubble is seen.
      d_restart <= 0;
      d_flush_X <= 0;

      // Determine write-back register.  We set this to 0 for
      // unrecognized instructions to avoid unintended effects.  (Valid
      // registers are remapped to 32 - 63 to avoid having to make a
      // special case for r0 in the bypass network.)
      case (i_opcode[5:3])
      0: case (i_opcode[2:0])
         `REG:    d_wbr <= {|i_rd[4:0],i_rd[4:0]};
         `REGIMM:
            if (i__rt == `SYNCI)
               d_wbr <= 0;
            else
               d_wbr <= {6{i_rt[4]}};// d_rt == `BLTZAL || d_rt == `BGEZAL ? 31 : 0;
         `JAL:    d_wbr <= 6'd32+6'd31;
         default: d_wbr <= 0; // no writeback
         endcase
      1: d_wbr <= {|i_rt[4:0],i_rt[4:0]}; // Immediate instructions
      2: if ((i_opcode == `CP0 || i_opcode == `CP2) && ~i_rs[4] && ~i_rs[2])
            // rs field has bits 4 and 2 clear and the result goes to rt.
            // NOTE(review): presumably the move-from (MFCz/CFCz) forms;
            // the original comment said "MTCP0" -- confirm against asm.v.
            d_wbr <= {|i_rt[4:0],i_rt[4:0]};
         else
            d_wbr <= 0;
      3: if (i_opcode == `RDHWR)
            d_wbr <= {|i_rt[4:0],i_rt[4:0]}; // RDHWR
         else
            // Without this branch d_wbr would silently keep the previous
            // instruction's write-back register.
            d_wbr <= 0;
      4: d_wbr <= {|i_rt[4:0],i_rt[4:0]}; // Loads
      default: d_wbr <= 0;
      endcase

      // Calculate branch targets
      case (i_opcode)
      `REGIMM: d_target <= i_branch_target;
      `BEQ:    d_target <= i_branch_target;
      `BNE:    d_target <= i_branch_target;
      `BLEZ:   d_target <= i_branch_target;
      `BGTZ:   d_target <= i_branch_target;
      `JAL:    d_target <= i_jump_target;
      `J:      d_target <= i_jump_target;
      default: d_target <= debug ? i_pc : 32'hxxxxxxxx;
      endcase

      // Detect control transfers
      d_has_delay_slot <= 0;
      case (i_opcode)
      `REG: case (i_fn)
            `JALR: d_has_delay_slot <= 1;
            `JR:   d_has_delay_slot <= 1;
            endcase
      `REGIMM: d_has_delay_slot <= 1;
      `BEQ:    d_has_delay_slot <= 1;
      `BNE:    d_has_delay_slot <= 1;
      `BLEZ:   d_has_delay_slot <= 1;
      `BGTZ:   d_has_delay_slot <= 1;
      `JAL:    d_has_delay_slot <= 1;
      `J:      d_has_delay_slot <= 1;
      endcase

      // We use d_illegal_instr to mark the instructions that we don't support
      case (i_opcode)
      `REG: case (i_fn)
            `SLL:    d_illegal_instr <= 0;
            `SRL:    d_illegal_instr <= 0;
            `SRA:    d_illegal_instr <= 0;
            `SLLV:   d_illegal_instr <= 0;
            `SRLV:   d_illegal_instr <= 0;
            `SRAV:   d_illegal_instr <= 0;
            `JALR:   d_illegal_instr <= 0;
            `JR:     d_illegal_instr <= 0;
            `MFHI:   d_illegal_instr <= 1;
            `MTHI:   d_illegal_instr <= 1;
            `MFLO:   d_illegal_instr <= 1;
            `MTLO:   d_illegal_instr <= 1;
            `MULT:   d_illegal_instr <= 1;
            `MULTU:  d_illegal_instr <= 1;
            `DIV:    d_illegal_instr <= 1;
            `DIVU:   d_illegal_instr <= 1;
            `ADD:    d_illegal_instr <= 1;
            `ADDU:   d_illegal_instr <= 0;
            `SUB:    d_illegal_instr <= 1;
            `SUBU:   d_illegal_instr <= 0;
            `AND:    d_illegal_instr <= 0;
            `OR:     d_illegal_instr <= 0;
            `XOR:    d_illegal_instr <= 0;
            `NOR:    d_illegal_instr <= 0;
            `SLT:    d_illegal_instr <= 0;
            `SLTU:   d_illegal_instr <= 0;
            default: d_illegal_instr <= 1;
            endcase
      `REGIMM: d_illegal_instr <= 0;
      `JAL:    d_illegal_instr <= 0;
      `J:      d_illegal_instr <= 0;
      `BEQ:    d_illegal_instr <= 0;
      `BNE:    d_illegal_instr <= 0;
      `BLEZ:   d_illegal_instr <= 0;
      `BGTZ:   d_illegal_instr <= 0;
      `ADDI:   d_illegal_instr <= 1;
      `ADDIU:  d_illegal_instr <= 0;
      `SLTI:   d_illegal_instr <= 0;
      `SLTIU:  d_illegal_instr <= 0;
      `ANDI:   d_illegal_instr <= 0;
      `ORI:    d_illegal_instr <= 0;
      `XORI:   d_illegal_instr <= 0;
      `LUI:    d_illegal_instr <= 0;
      `CP0:    d_illegal_instr <= 0; // Supported == ignored
      // `CP1:
      `CP2:    d_illegal_instr <= 0; // Supported == ignored
      // `BBQL:
      `LB:     d_illegal_instr <= 0;
      `LBU:    d_illegal_instr <= 0;
      `LH:     d_illegal_instr <= 0;
      `LHU:    d_illegal_instr <= 0;
      `LW:     d_illegal_instr <= 0;
      `SB:     d_illegal_instr <= 0;
      `SH:     d_illegal_instr <= 0;
      `SW:     d_illegal_instr <= 0;
      default: d_illegal_instr <= 1;
      endcase

      /*
       * Detect and restart upon delay-slot bubbles.  The cost of this
       * is completely hidden by the I$ fill overhead.  The guarantee
       * that delay slots always follow immediately after their
       * preceding instruction makes correct handling of delayed
       * branches/jumps much simpler.
       *
       * Note, we are making an assumption on the I$ behaviour here,
       * that is, we're assuming it will (eventually) always be
       * possible to hit the two consecutive lines without a miss.
       * All normal caches behave this way (even directly mapped),
       * but it may be possible to construct an odd cache that
       * doesn't have this property.
       */

      // These assignments come last so that they override the defaults
      // made at the top of this block.
      if (d_valid & ~flush_D & d_has_delay_slot & ~i_valid) begin
         $display("%05d *** Taken-branch w/bubble delay slot, restarting branch at %8x",
                  $time, d_pc);
         d_valid      <= 0;
         d_restart    <= 1;
         d_restart_pc <= d_pc;
         d_flush_X    <= 1;
      end

      // Delay the count one cycle to improve cycle time: this tests the
      // registered d_restart instead of repeating the condition above.
      if (d_restart)
         perf_delay_slot_bubble <= perf_delay_slot_bubble + 1;

      if (m_valid)
         perf_retired_inst <= perf_retired_inst + 1;
   end

`ifdef SIMULATE_MAIN
   always @(posedge clock) begin
      if (0)
         $display("%05d d_op1_val (r%1d) %8x d_op2_is_imm %1d ? d_simm %1d : d_rt_val (r%1d) %8x (non fwd %8x %8x) (d_forward_x_to_t %1d %1d %1d %1d)", $time,
                  d_rs[4:0], d_op1_val,
                  d_op2_is_imm, d_simm, d_rt[4:0], d_rt_val,
                  rs_reg_val, rt_reg_val,

                  d_forward_x_to_t,
                  x_valid, i_rt, x_wbr
                  );

      if (0) begin
         if (d_forward_x_to_s)
            $display("%05d DE %8x: rs (r%1d) <- EX (%8x)", $time,
                     d_pc, x_wbr - 32, x_res);
         else if (d_forward_m_to_s)
            $display("%05d DE %8x: rs (r%1d) <- ME (%8x)", $time,
                     d_pc, m_wbr - 32, m_res);

         if (d_forward_x_to_t)
            $display("%05d DE %8x: rt (r%1d) <- EX (%8x)", $time,
                     d_pc, x_wbr - 32, x_res);
         else if (d_forward_m_to_t)
            $display("%05d DE %8x: rt (r%1d) <- ME (%8x)", $time,
                     d_pc, m_wbr - 32, m_res);
      end

      // !!CAREFUL!! This line is being matched by the cosimulation,
      // so if anything is changed, then run_simple.c:get_rtl_commit()
      // must be adjusted accordingly.
      if (m_valid & m_wbr[5])
         $display("%05d COMMIT %8x:r%02d <- %8x",
                  $time, m_pc, m_wbr[4:0], m_res);

      if (debug) begin
         $display("%5db DE: instr %8x valid %d (m_wbr:%2x) (i_npc %8x i_offset*4 %8x target %8x)",
                  $time, d_instr, d_valid, m_wbr,
                  i_npc, i_offset << 2, i_jump_target);

         // Dump of the first twelve registers.  Reads regs_A: the two
         // register file copies are written identically and there is
         // no array called "regs".
         if (0)
            $display("%5db DE: %x %x %x %x %x %x %x %x %x %x %x %x ", $time,
                     regs_A[0], regs_A[1], regs_A[2], regs_A[3],
                     regs_A[4], regs_A[5], regs_A[6], regs_A[7],
                     regs_A[8], regs_A[9], regs_A[10], regs_A[11]);
         if (1)
            $display("%5db DE: %x %x %x", $time,
                     d_target, i_branch_target, i_jump_target);
      end
   end

   // Combinational views of the register file read ports (debug only).
   wire [31:0] debug_regs_rs = regs_A[i_rs[4:0]];
   wire [31:0] debug_regs_rt = regs_B[i_rt[4:0]];

   // Simulation-only: start with both register file copies zeroed.
   reg [31:0] i;
   initial
      for (i = 0; i < 32; i = i + 1) begin
         regs_A[i] = 0;
         regs_B[i] = 0;
      end
`endif
endmodule