1 // -----------------------------------------------------------------------
3 // Copyright 2004,2007,2008 Tommy Thorn - All Rights Reserved
5 // This program is free software; you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation, Inc., 53 Temple Place Ste 330,
8 // Boston MA 02111-1307, USA; either version 2 of the License, or
9 // (at your option) any later version; incorporated herein by reference.
11 // -----------------------------------------------------------------------
15 `include "perfcounters.v"
17 module stage_X(input wire clock
19 ,input wire restart
// for synci
20 ,input wire [31:0] restart_pc
// for synci
23 ,input wire [31:0] d_instr
24 ,input wire [31:0] d_pc
25 ,input wire [31:0] d_npc
26 ,input wire [ 5:0] d_opcode
27 ,input wire [ 5:0] d_fn
28 ,input wire [ 4:0] d_rd
29 ,input wire [ 5:0] d_rs
30 ,input wire [ 5:0] d_rt
31 ,input wire [ 4:0] d_sa
32 ,input wire [31:0] d_target
33 ,input wire [ 5:0] d_wbr
34 ,input wire d_has_delay_slot
36 ,input wire [31:0] d_op1_val
37 ,input wire [31:0] d_op2_val
38 ,input wire [31:0] d_rt_val
39 ,input wire [31:0] d_simm
43 ,input wire [31:0] d_restart_pc
45 ,input wire d_load_use_hazard
48 ,input wire [ 5:0] m_wbr
50 ,output reg x_valid
= 0
51 ,output reg [31:0] x_instr
= 0 // XXX for debugging only
52 ,output reg x_is_delay_slot
= 0
53 ,output reg [31:0] x_pc
= 0
54 ,output reg [ 5:0] x_opcode
= 0
55 ,output reg [31:0] x_op1_val
= 0 // XXX
56 ,output reg [ 5:0] x_rt
= 0
57 ,output reg [31:0] x_rt_val
= 0 // for stores only
58 ,output reg [ 5:0] x_wbr
= 0
59 ,output reg [31:0] x_res
61 ,output reg x_synci
= 0
62 ,output reg [31:0] x_synci_a
= 0
64 ,output reg x_restart
= 0
65 ,output reg [31:0] x_restart_pc
= 0
66 ,output reg x_flush_D
= 0
68 ,output reg [31:0] perf_branch_hazard
= 0
69 ,input wire [31:0] perf_dcache_misses
70 ,input wire [31:0] perf_delay_slot_bubble
71 ,output reg [31:0] perf_div_hazard
= 0
72 ,input wire [31:0] perf_icache_misses
73 ,input wire [31:0] perf_io_load_busy
74 ,input wire [31:0] perf_io_store_busy
75 ,input wire [31:0] perf_load_hit_store_hazard
76 ,output reg [31:0] perf_load_use_hazard
= 0
77 ,output reg [31:0] perf_mult_hazard
= 0
78 ,input wire [47:0] perf_retired_inst
79 ,input wire [31:0] perf_sb_full
86 reg [31:0] x_op2_val
= 0;
92 /* XXX Ideally, the core frequency is a configuration variable set
93 * at the top level, but as I'm using a different platform than the
94 * one we're comparing JOP to, I hardwire it here. This isn't
95 * cheating as this is the frequency we attain on an EP1C12C6 that
96 * the JOP numbers came from, but I don't have that particular FPGA.
98 wire [31:0] perf_frequency
= 75000;
100 wire d_ops_eq
= d_op1_val
== d_op2_val
;
101 reg x_negate_op2
= 0;
103 always @(posedge clock
)
104 x_negate_op2
<= d_opcode
== `SLTI ||
105 d_opcode
== `SLTIU ||
106 d_opcode
== `REG && (d_fn == `SLT ||
112 wire [31:0] x_op2_neg
= {32{x_negate_op2
}} ^ x_op2_val
;
113 assign {x_carry_flag
,x_sum
} = x_op1_val
+ x_op2_neg
+ x_negate_op2
;
114 wire x_sign_flag
= x_sum
[31];
115 wire x_overflow_flag
= x_op1_val
[31] == x_op2_neg
[31] &&
116 x_op1_val
[31] != x_sum
[31];
117 wire [4:0] x_shift_dist
= x_fn
[2] ? x_op1_val
[4:0] : x_sa
;
119 // XXX BUG These architectural registers must live in ME or later
120 // as ME can flush the pipe rendering an update of state in EX
121 // premature. Of course this leads to headaches with forwarding and
122 // hazards on instructions depending on these... Sigh.
124 reg [63:0] mult_a
= 0, mult_3a
= 0;
125 reg [31:0] mult_b
= 0;
127 reg [31:0] mult_lo
= 0;
128 reg [31:0] mult_hi
= 0;
130 reg div_busy
= 0, div_neg_res
, div_neg_rem
;
131 reg [31:0] divisor
= 0, div_hi
= 0, div_lo
= 0;
133 reg [ 6:0] div_n
= 0;
135 reg [31:0] cp0_status
= 0, // XXX -- " --
140 reg x_has_delay_slot
= 0;
142 reg [35:0] tsc
= 0; // Free running counter
144 reg branch_event
= 0;
146 reg [31:0] x_special
= 0; // A value that can be precomputed
147 always @(posedge clock
)
149 `REG: x_special <= d_npc + 4;
150 `REGIMM: x_special <= d_npc + 4;
151 `JAL: x_special <= d_npc + 4;
154 0: x_special
<= 0; // # of processors-1
155 1: x_special
<= 4 << IC_WORD_INDEX_BITS
;
156 2: x_special
<= tsc
[35:4]; // @40 MHz 28 min before rollover
157 3: x_special
<= 1 << 4; // TSC scaling factor
160 `LUI: x_special <= {d_simm[15: 0], 16'd0};
163 `PERF_BRANCH_HAZARD: x_special <= perf_branch_hazard;
164 `PERF_DCACHE_MISSES: x_special <= perf_dcache_misses;
165 `PERF_DELAY_SLOT_BUBBLE: x_special <= perf_delay_slot_bubble;
166 `PERF_DIV_HAZARD: x_special <= perf_div_hazard;
167 `PERF_FREQUENCY: x_special <= perf_frequency;
168 `PERF_ICACHE_MISSES: x_special <= perf_icache_misses;
169 `PERF_IO_LOAD_BUSY: x_special <= perf_io_load_busy;
170 `PERF_IO_STORE_BUSY: x_special <= perf_io_store_busy;
171 `PERF_LOAD_HIT_STORE_HAZARD: x_special <= perf_load_hit_store_hazard;
172 `PERF_LOAD_USE_HAZARD: x_special <= perf_load_use_hazard;
173 `PERF_MULT_HAZARD: x_special <= perf_mult_hazard;
174 // Counts in units of 16 retired instructions. @40 MHz and 1 CPI, it takes 28 min to roll over
175 `PERF_RETIRED_INST: x_special <= perf_retired_inst[35:4];
176 `PERF_SB_FULL: x_special <= perf_sb_full;
189 `SLL : x_res = x_op2_val << x_shift_dist;
190 `SRL : x_res = x_op2_val >> x_shift_dist;
191 `SRA : x_res = $signed(x_op2_val) >>> x_shift_dist;
192 `SLLV: x_res = x_op2_val << x_shift_dist;
193 `SRLV: x_res = x_op2_val >> x_shift_dist;
194 `SRAV: x_res = $signed(x_op2_val) >>> x_shift_dist;
196 `JALR: x_res = x_special;
197 // XXX BUG See the comment above with mult_lo and mult_hi
198 `MFHI: x_res = mult_hi;
199 `MFLO: x_res = mult_lo;
200 // XXX BUG Trap on overflow for ADD, ADDI and SUB
202 `ADDU: x_res = x_sum;
204 `SUBU: x_res = x_sum;
205 `AND: x_res = x_op1_val & x_op2_val;
206 `OR: x_res = x_op1_val | x_op2_val;
207 `XOR: x_res = x_op1_val ^ x_op2_val;
208 `NOR: x_res = ~(x_op1_val | x_op2_val);
209 `SLT: x_res = {{31{1'b0}}, x_sign_flag ^ x_overflow_flag};
210 `SLTU: x_res = {{31{1'b0}}, ~x_carry_flag};
211 default: x_res
= 'hX
;
213 `REGIMM: x_res = x_special;// BLTZ, BGEZ, BLTZAL, BGEZAL
214 `JAL: x_res = x_special;
215 `ADDI: x_res = x_sum;
216 `ADDIU: x_res = x_sum;
217 `SLTI: x_res = {{31{1'b0}}, x_sign_flag ^ x_overflow_flag};
218 `SLTIU: x_res = {{31{1'b0}}, ~x_carry_flag};
219 `ANDI: x_res = {16'b0, x_op1_val[15:0] & x_op2_val[15:0]};
220 `ORI: x_res = {x_op1_val[31:16], x_op1_val[15:0] | x_op2_val[15:0]};
221 `XORI: x_res = {x_op1_val[31:16], x_op1_val[15:0] ^ x_op2_val[15:0]};
222 `LUI: x_res = x_special;
224 `RDHWR: x_res = x_special;
225 `CP2: x_res = x_special;
226 default: x_res
= 'hX
;
230 always @(posedge clock
) begin
236 x_opcode
<= d_opcode
;
240 x_op1_val
<= d_op1_val
;
241 x_op2_val
<= d_op2_val
;
243 x_rt_val
<= d_rt_val
;
245 x_has_delay_slot
<= d_has_delay_slot
& d_valid
;
246 x_is_delay_slot
<= x_has_delay_slot
& x_valid
;
249 x_restart_pc
<= d_target
;
254 /* Stat counts aren't critical, so I delay them to keep them out
255 of the critical path */
257 perf_branch_hazard
<= perf_branch_hazard
+ 1;
260 //`define MULT_RADIX_4 1
262 // Radix-4 Multiplication Machine (this is not the best way to do this)
264 $display("MULT[U] %x * %x + %x", mult_a
, mult_b
, {mult_hi
,mult_lo
});
267 1: {mult_hi
,mult_lo
} <= {mult_hi
,mult_lo
} + mult_a
;
268 2: {mult_hi
,mult_lo
} <= {mult_hi
,mult_lo
} + (mult_a
<< 1);
269 3: {mult_hi
,mult_lo
} <= {mult_hi
,mult_lo
} + mult_3a
;
271 mult_a
<= mult_a
<< 2;
272 mult_3a
<= mult_3a
<< 2;
273 mult_b
<= mult_b
>> 2;
274 if (mult_b
== 0) begin
276 {mult_hi
,mult_lo
} <= 64'd0 - {mult_hi
,mult_lo
};
280 $display("MULT[U] = %x", mult_a
+ {mult_hi
,mult_lo
});
284 // Radix-2 Multiplication Machine (this is not the best way to do this)
286 $display("MULT[U] %x * %x + %x", mult_a
, mult_b
, {mult_hi
,mult_lo
});
289 {mult_hi
,mult_lo
} <= {mult_hi
,mult_lo
} + mult_a
;
290 mult_a
<= mult_a
<< 1;
291 mult_b
<= mult_b
>> 1;
292 if (mult_b
== 0) begin
294 {mult_hi
,mult_lo
} <= 64'd0 - {mult_hi
,mult_lo
};
298 $display("MULT[U] = %x", mult_a
+ {mult_hi
,mult_lo
});
303 // XXX the use of blocking assignments here is intentional
304 // (easier to read), but it has the unfortunate consequence of
305 // making the final negation more expensive than it should have
306 // been. Rework this.
308 {div_hi
,div_lo
} = {div_hi
,div_lo
} << 1;
309 diff
= div_hi
- divisor
;
314 div_n
<= div_n
- 1'd1;
315 end else if (div_busy
) begin
317 mult_lo
<= div_neg_res ?
-div_lo
: div_lo
; // result
318 mult_hi
<= div_neg_rem ?
-div_hi
: div_hi
; // remainder
319 $display("DIV = hi %d lo %d",
320 div_neg_rem ?
-div_hi
: div_hi
,
321 div_neg_res ?
-div_lo
: div_lo
);
329 $display("JAL: d_npc = %x", d_npc
);
331 x_restart_pc
<= d_op1_val
;
337 x_restart_pc
<= d_op1_val
;
341 // XXX BUG See the comment above with mult_lo and mult_hi
343 if ((mult_busy | div_busy
) && d_valid
) begin
346 x_restart_pc
<= d_pc
- {x_has_delay_slot
,2'd0};
349 perf_mult_hazard
<= perf_mult_hazard
+ 1;
351 perf_div_hazard
<= perf_div_hazard
+ 1;
354 if ((mult_busy | div_busy
) && d_valid
) begin
357 x_restart_pc
<= d_pc
- {x_has_delay_slot
,2'd0};
360 perf_mult_hazard
<= perf_mult_hazard
+ 1;
362 perf_div_hazard
<= perf_div_hazard
+ 1;
366 if (mult_busy | div_busy
) begin
369 x_restart_pc
<= d_pc
- {x_has_delay_slot
,2'd0};
372 perf_mult_hazard
<= perf_mult_hazard
+ 1;
374 perf_div_hazard
<= perf_div_hazard
+ 1;
376 mult_hi
<= d_op1_val
;
380 if (mult_busy | div_busy
) begin
383 x_restart_pc
<= d_pc
- {x_has_delay_slot
,2'd0};
386 perf_mult_hazard
<= perf_mult_hazard
+ 1;
388 perf_div_hazard
<= perf_div_hazard
+ 1;
390 mult_lo
<= d_op1_val
;
395 if (mult_busy | div_busy
) begin
398 x_restart_pc
<= d_pc
- {x_has_delay_slot
,2'd0};
401 perf_mult_hazard
<= perf_mult_hazard
+ 1;
403 perf_div_hazard
<= perf_div_hazard
+ 1;
407 div_lo
<= d_op1_val
[31] ?
-d_op1_val
: d_op1_val
;
408 divisor
<= d_op2_val
[31] ?
-d_op2_val
: d_op2_val
;
409 div_neg_res
<= d_op1_val
[31] ^ d_op2_val
[31];
411 // res = a/b, rem = a - b*(a/b)
412 // thus the rem sign follows a only
414 div_neg_rem
<= d_op1_val
[31];
416 $display("%05dc EX: %d / %d", $time, d_op1_val
, d_op2_val
);
421 if (mult_busy | div_busy
) begin
424 x_restart_pc
<= d_pc
- {x_has_delay_slot
,2'd0};
427 perf_mult_hazard
<= perf_mult_hazard
+ 1;
429 perf_div_hazard
<= perf_div_hazard
+ 1;
434 divisor
<= d_op2_val
;
438 $display("%05dc EX: %d /U %d", $time, d_op1_val
, d_op2_val
);
443 if (mult_busy | div_busy
) begin
446 x_restart_pc
<= d_pc
- {x_has_delay_slot
,2'd0};
449 perf_mult_hazard
<= perf_mult_hazard
+ 1;
451 perf_div_hazard
<= perf_div_hazard
+ 1;
453 $display("MULTU %x * %x", d_op1_val
, d_op2_val
);
459 mult_3a
<= 3 * d_op1_val
;
462 $display("%05dc EX: %dU * %dU", $time, d_op1_val
, d_op2_val
);
467 if (mult_busy | div_busy
) begin
470 x_restart_pc
<= d_pc
- {x_has_delay_slot
,2'd0};
473 perf_mult_hazard
<= perf_mult_hazard
+ 1;
475 perf_div_hazard
<= perf_div_hazard
+ 1;
477 $display("MULT %x * %x", d_op1_val
, d_op2_val
);
481 mult_neg
<= d_op1_val
[31] ^ d_op2_val
[31];
482 mult_a
<= d_op1_val
[31] ?
{32'd0,32'd0 - d_op1_val
} : d_op1_val
;
483 mult_3a
<= d_op1_val
[31] ?
3 * {32'd0,32'd0-d_op1_val
} : 3 * d_op1_val
;
484 mult_b
<= d_op2_val
[31] ?
32'd0 - d_op2_val
: d_op2_val
;
485 $display("%05dc EX: %d * %d", $time, d_op1_val
, d_op2_val
);
491 x_restart_pc
<= 'hBFC00380
;
493 cp0_status
[`CP0_STATUS_EXL] <= 1;
494 //cp0_cause.exc_code = EXC_BP;
496 // cp0_cause.bd = branch_delay_slot; // XXX DELAY SLOT HANDLING!
497 cp0_epc
<= d_pc
; // XXX DELAY SLOT HANDLING!
500 `REGIMM: // BLTZ, BGEZ, BLTZAL, BGEZAL
502 if (d_rt
[4:0] == `SYNCI) begin
504 x_restart_pc
<= x_restart ? restart_pc
: d_npc
;
506 $display("synci restart at %x (d_restart = %d, d_restart_pc = %x, d_npc = %x)",
507 d_restart ? d_restart_pc
: d_npc
,
508 d_restart
, d_restart_pc
, d_npc
);
510 x_synci_a
<= d_op1_val
+ d_simm
;
512 x_restart
<= d_rt
[0] ^ d_op1_val
[31];
520 `J: if (d_valid) x_restart <= 1;
523 x_restart
<= d_ops_eq
;
524 branch_event
<= d_ops_eq
;
525 $display("%05d BEQ %8x == %8x (%1d)", $time,
526 d_op1_val
, d_op2_val
, d_ops_eq
);
530 x_restart
<= ~d_ops_eq
;
531 branch_event
<= ~d_ops_eq
;
532 $display("%05d BNE %8x == %8x (%1d)", $time,
533 d_op1_val
, d_op2_val
, d_ops_eq
);
538 x_restart
<= d_op1_val
[31] || d_op1_val
== 0;
539 branch_event
<= (d_op1_val
[31] || d_op1_val
== 0);
545 x_restart
<= !d_op1_val
[31] && d_op1_val
!= 0;
546 branch_event
<= (!d_op1_val
[31] && d_op1_val
!= 0);
551 if (d_valid
&& !d_rs
[4] && 0) begin
552 if (mult_lo
== 32'h87654321
)
553 $display("TEST SUCCEEDED!");
555 $display("%05d TEST FAILED WITH %x (%1d:%8x:%8x)", $time, mult_lo
,
556 d_valid
, d_pc
, d_instr
);
557 $finish; // XXX do something more interesting for real hw.
562 $display("MTCP2 r%d <- %x (ignored)", d_rd
, d_op2_val
);
564 $display("MFCP2 r%d", d_rd
);
568 * XXX Comment out the CP0 handling for now. I want to handle
569 * that in a way that doesn't affect the performance of the
570 * regular instructions
573 `CP0: if (d_valid) begin
574 /* Two possible formats */
576 if (d_fn
== `C0_ERET) begin
577 /* Exception Return */
579 x_flush_D
<= 1; // XXX BUG? Check that ERET doesn't have a delay slot!
580 if (cp0_status
[`CP0_STATUS_ERL]) begin
581 x_restart_pc
<= cp0_errorepc
;
582 cp0_status
[`CP0_STATUS_ERL] <= 0;
584 $display("ERET ERROREPC %x", cp0_errorepc
);
587 x_restart_pc
<= cp0_epc
;
588 cp0_status
[`CP0_STATUS_EXL] <= 0;
590 $display("ERET EPC %x", cp0_epc
);
597 $display("Unhandled CP0 command %s\n",
598 d_fn
== `C0_TLBR ? "tlbr" :
599 d_fn
== `C0_TLBWI ? "tlbwi" :
600 d_fn
== `C0_TLBWR ? "tlbwr" :
601 d_fn
== `C0_TLBP ? "tlbp" :
602 d_fn
== `C0_ERET ? "eret" :
603 d_fn
== `C0_DERET ? "deret" :
604 d_fn
== `C0_WAIT ? "wait" :
610 $display("MTCP0 r%d <- %x", d_rd
, d_op2_val
);
612 $display("MFCP0 r%d", d_rd
);
614 if (d_fn
!= 0) $display("d_fn == %x", d_fn
);
617 x_wbr
<= 0; // XXX BUG?
618 // cp0regs[i.r.rd] = t;
622 cp0_status
<= d_op2_val
;
623 $display("STATUS <= %x", d_op2_val
);
627 cp0_cause
<= d_op2_val
;
628 $display("CAUSE <= %x", d_op2_val
);
632 cp0_epc
<= d_op2_val
;
633 $display("EPC <= %x", d_op2_val
);
637 cp0_errorepc
<= d_op2_val
;
638 $display("ERROREPC <= %x", d_op2_val
);
642 cp0_status.res1 = cp0_status.res2 = 0;
643 printf("Operating mode %s\n",
644 cp0_status.ksu == 0 ? "kernel" :
645 cp0_status.ksu == 1 ? "supervisor" :
646 cp0_status.ksu == 2 ? "user" : "??");
647 printf("Exception level %d\n", cp0_status.exl);
648 printf("Error level %d\n", cp0_status.erl);
649 printf("Interrupts %sabled\n", cp0_status.ie ? "en" : "dis");
653 $display("Setting an unknown CP0 register %d", d_rd
);
662 if (d_load_use_hazard
)
663 perf_load_use_hazard
<= perf_load_use_hazard
+ 1;