2 Copyright (C) 1996-1997 Id Software, Inc.
4 This program is free software; you can redistribute it and/or
5 modify it under the terms of the GNU General Public License
6 as published by the Free Software Foundation; either version 2
7 of the License, or (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13 See the GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
22 // x86 assembly-language horizontal 8-bpp span-drawing code.
32 //----------------------------------------------------------------------
33 // 8-bpp horizontal span drawing code for polygons, with no transparency.
35 // Assumes there is at least one span in pspans, and that every span
36 // contains at least one pixel
37 //----------------------------------------------------------------------
41 // out-of-line, rarely-needed clamping code
44 movl C
(bbextents
),%esi
52 movl C
(bbextentt
),%edx
63 movl C
(bbextents
),%ebp
70 movl C
(bbextentt
),%ecx
77 movl C
(bbextents
),%eax
84 movl C
(bbextentt
),%ebx
91 .globl C(D_DrawSpans8)
93 pushl
%ebp
// preserve caller
's stack frame
95 pushl %esi // preserve register variables
99 // set up scaled-by-8 steps, for 8-long segments; also set up cacheblock
100 // and span list pointers
102 // TODO: any overlap from rearranging?
105 movl C(cacheblock),%edx
108 movl pspans(%esp),%ebx // point to the first span descriptor
111 movl %edx,pbase // pbase = cacheblock
118 // set up the initial s/z, t/z, and 1/z on the FP stack, and generate the
119 // initial s and t values
121 // FIXME: pipeline FILD?
122 fildl espan_t_v(%ebx)
123 fildl espan_t_u(%ebx)
125 fld %st(1) // dv | du | dv
126 fmuls C(d_sdivzstepv) // dv*d_sdivzstepv | du | dv
127 fld %st(1) // du | dv*d_sdivzstepv | du | dv
128 fmuls C(d_sdivzstepu) // du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
129 fld %st(2) // du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
130 fmuls C(d_tdivzstepu) // du*d_tdivzstepu | du*d_sdivzstepu |
131 // dv*d_sdivzstepv | du | dv
132 fxch %st(1) // du*d_sdivzstepu | du*d_tdivzstepu |
133 // dv*d_sdivzstepv | du | dv
134 faddp %st(0),%st(2) // du*d_tdivzstepu |
135 // du*d_sdivzstepu + dv*d_sdivzstepv | du | dv
136 fxch %st(1) // du*d_sdivzstepu + dv*d_sdivzstepv |
137 // du*d_tdivzstepu | du | dv
138 fld %st(3) // dv | du*d_sdivzstepu + dv*d_sdivzstepv |
139 // du*d_tdivzstepu | du | dv
140 fmuls C(d_tdivzstepv) // dv*d_tdivzstepv |
141 // du*d_sdivzstepu + dv*d_sdivzstepv |
142 // du*d_tdivzstepu | du | dv
143 fxch %st(1) // du*d_sdivzstepu + dv*d_sdivzstepv |
144 // dv*d_tdivzstepv | du*d_tdivzstepu | du | dv
145 fadds C(d_sdivzorigin) // sdivz = d_sdivzorigin + dv*d_sdivzstepv +
146 // du*d_sdivzstepu; stays in %st(2) at end
147 fxch %st(4) // dv | dv*d_tdivzstepv | du*d_tdivzstepu | du |
149 fmuls C(d_zistepv) // dv*d_zistepv | dv*d_tdivzstepv |
150 // du*d_tdivzstepu | du | s/z
151 fxch %st(1) // dv*d_tdivzstepv | dv*d_zistepv |
152 // du*d_tdivzstepu | du | s/z
153 faddp %st(0),%st(2) // dv*d_zistepv |
154 // dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z
155 fxch %st(2) // du | dv*d_tdivzstepv + du*d_tdivzstepu |
156 // dv*d_zistepv | s/z
157 fmuls C(d_zistepu) // du*d_zistepu |
158 // dv*d_tdivzstepv + du*d_tdivzstepu |
159 // dv*d_zistepv | s/z
160 fxch %st(1) // dv*d_tdivzstepv + du*d_tdivzstepu |
161 // du*d_zistepu | dv*d_zistepv | s/z
162 fadds C(d_tdivzorigin) // tdivz = d_tdivzorigin + dv*d_tdivzstepv +
163 // du*d_tdivzstepu; stays in %st(1) at end
164 fxch %st(2) // dv*d_zistepv | du*d_zistepu | t/z | s/z
165 faddp %st(0),%st(1) // dv*d_zistepv + du*d_zistepu | t/z | s/z
167 flds fp_64k // fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z
168 fxch %st(1) // dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z
169 fadds C(d_ziorigin) // zi = d_ziorigin + dv*d_zistepv +
170 // du*d_zistepu; stays in %st(0) at end
171 // 1/z | fp_64k | t/z | s/z
173 // calculate and clamp s & t
175 fdivr %st(0),%st(1) // 1/z | z*64k | t/z | s/z
178 // point %edi to the first pixel in the span
180 movl C(d_viewbuffer),%ecx
181 movl espan_t_v(%ebx),%eax
182 movl %ebx,pspantemp // preserve spans pointer
186 movl C(d_scantable)(,%eax,4),%edi // v * screenwidth
188 movl espan_t_u(%ebx),%ecx
189 addl %ecx,%edi // pdest = &pdestspan[scans->u];
190 movl espan_t_count(%ebx),%ecx
193 // now start the FDIV for the end of the span
199 jz LCleanup1 // if only one pixel, no need to start an FDIV
200 movl %ecx,spancountminus1
202 // finish up the s and t calcs
203 fxch %st(1) // z*64k | 1/z | t/z | s/z
205 fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z
206 fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z
207 fxch %st(1) // z*64k | s | 1/z | t/z | s/z
208 fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z
209 fxch %st(1) // s | t | 1/z | t/z | s/z
210 fistpl s // 1/z | t | t/z | s/z
211 fistpl t // 1/z | t/z | s/z
213 fildl spancountminus1
215 flds C(d_tdivzstepu) // C(d_tdivzstepu) | spancountminus1
216 flds C(d_zistepu) // C(d_zistepu) | C(d_tdivzstepu) | spancountminus1
217 fmul %st(2),%st(0) // C(d_zistepu)*scm1 | C(d_tdivzstepu) | scm1
218 fxch %st(1) // C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
219 fmul %st(2),%st(0) // C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
220 fxch %st(2) // scm1 | C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1
221 fmuls C(d_sdivzstepu) // C(d_sdivzstepu)*scm1 | C(d_zistepu)*scm1 |
222 // C(d_tdivzstepu)*scm1
223 fxch %st(1) // C(d_zistepu)*scm1 | C(d_sdivzstepu)*scm1 |
224 // C(d_tdivzstepu)*scm1
225 faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
226 fxch %st(1) // C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
227 faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1
231 fdiv %st(1),%st(0) // this is what we've gone to all this trouble to
236 // finish up the s and t calcs
237 fxch
%st(1) // z
*64k |
1/z |
t/z | s
/z
239 fld
%st(0) // z
*64k | z
*64k |
1/z |
t/z | s
/z
240 fmul %st(4),%st(0) // s | z
*64k |
1/z |
t/z | s
/z
241 fxch
%st(1) // z
*64k | s |
1/z |
t/z | s
/z
242 fmul %st(3),%st(0) // t | s |
1/z |
t/z | s
/z
243 fxch
%st(1) // s |
t |
1/z |
t/z | s
/z
244 fistpl s
// 1/z |
t |
t/z | s
/z
245 fistpl
t // 1/z |
t/z | s
/z
250 // finish up the s and t calcs
251 fxch
%st(1) // z
*64k |
1/z |
t/z | s
/z
253 fld
%st(0) // z
*64k | z
*64k |
1/z |
t/z | s
/z
254 fmul %st(4),%st(0) // s | z
*64k |
1/z |
t/z | s
/z
255 fxch
%st(1) // z
*64k | s |
1/z |
t/z | s
/z
256 fmul %st(3),%st(0) // t | s |
1/z |
t/z | s
/z
257 fxch
%st(1) // s |
t |
1/z |
t/z | s
/z
258 fistpl s
// 1/z |
t |
t/z | s
/z
259 fistpl
t // 1/z |
t/z | s
/z
268 fdiv %st(1),%st(0) // z
= 1/1/z
269 // this is what we've gone to all this trouble to overlap
275 movl C(bbextents),%ebx
276 movl C(bbextentt),%ebp
288 movl s,%esi // sfrac = scans->sfrac;
290 movl t,%eax // tfrac = scans->tfrac;
295 // calculate the texture starting address
298 movl C(cachewidth),%edx
299 imull %edx,%eax // (tfrac >> 16) * cachewidth
301 addl %eax,%esi // psource = pbase + (sfrac >> 16) +
302 // ((tfrac >> 16) * cachewidth);
305 // determine whether last span or not
311 // not the last segment; do full 8-wide segment
316 // advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
320 // pick up after the FDIV that was left in flight previously
322 fld %st(0) // duplicate it
323 fmul %st(4),%st(0) // s = s/z * z
325 fmul %st(3),%st(0) // t = t/z * z
332 movb (%esi),%bl // get first source texel
333 subl $8,%ecx // count off this segments' pixels
335 movl
%ecx
,counttemp
// remember count of remaining pixels
338 movb
%bl,(%edi
) // store first dest pixel
343 movl C
(bbextents
),%eax
344 movl C
(bbextentt
),%edx
365 // set up advancetable
369 sarl $
19,%eax
// tstep
>>= 16;
371 sarl $
19,%edx
// sstep
>>= 16;
372 movl C
(cachewidth
),%ebx
377 sarl $
19,%edx
// sstep
>>= 16;
378 movl C
(cachewidth
),%ebx
382 addl
%edx
,%eax
// add in sstep
383 // (tstep >> 16) * cachewidth + (sstep >> 16);
385 movl
%eax
,advancetable+
4 // advance base in
t
386 addl
%ebx
,%eax
// ((tstep
>> 16) + 1) * cachewidth
+
388 shll $
13,%ebp
// left-justify sstep fractional part
390 shll $
13,%ecx
// left-justify tstep fractional part
391 movl
%eax
,advancetable
// advance extra in
t
394 addl
%ecx
,%edx
// advance tfrac fractional part by tstep frac
396 sbbl
%ecx
,%ecx
// turn tstep carry into
-1 (0 if none
)
397 addl
%ebp
,%ebx
// advance sfrac fractional part by sstep frac
398 adcl advancetable+
4(,%ecx
,4),%esi
// point to next source texel
405 adcl advancetable+
4(,%ecx
,4),%esi
411 adcl advancetable+
4(,%ecx
,4),%esi
418 adcl advancetable+
4(,%ecx
,4),%esi
425 adcl advancetable+
4(,%ecx
,4),%esi
429 // start FDIV for end of next segment in flight, so it can overlap
432 cmpl $
8,%ecx
// more than one segment after this?
433 ja LSetupNotLast2
// yes
436 jz LFDIVInFlight2
// if only one pixel
, no need to start an
FDIV
437 movl
%ecx
,spancountminus1
438 fildl spancountminus1
440 flds C
(d_zistepu
) // C
(d_zistepu
) | spancountminus1
441 fmul %st(1),%st(0) // C
(d_zistepu
)*scm1 | scm1
442 flds C
(d_tdivzstepu
) // C
(d_tdivzstepu
) | C
(d_zistepu
)*scm1 | scm1
443 fmul %st(2),%st(0) // C
(d_tdivzstepu
)*scm1 | C
(d_zistepu
)*scm1 | scm1
444 fxch
%st(1) // C
(d_zistepu
)*scm1 | C
(d_tdivzstepu
)*scm1 | scm1
445 faddp
%st(0),%st(3) // C
(d_tdivzstepu
)*scm1 | scm1
446 fxch
%st(1) // scm1 | C
(d_tdivzstepu
)*scm1
447 fmuls C
(d_sdivzstepu
) // C
(d_sdivzstepu
)*scm1 | C
(d_tdivzstepu
)*scm1
448 fxch
%st(1) // C
(d_tdivzstepu
)*scm1 | C
(d_sdivzstepu
)*scm1
449 faddp
%st(0),%st(3) // C
(d_sdivzstepu
)*scm1
450 flds fp_64k
// 64k | C
(d_sdivzstepu
)*scm1
451 fxch
%st(1) // C
(d_sdivzstepu
)*scm1 |
64k
452 faddp
%st(0),%st(4) // 64k
454 fdiv %st(1),%st(0) // this is what we
've gone to all this trouble to
467 fdiv %st(1),%st(0) // z = 1/1/z
468 // this is what we've gone to all this trouble to overlap
478 adcl advancetable+
4(,%ecx
,4),%esi
485 adcl advancetable+
4(,%ecx
,4),%esi
492 adcl advancetable+
4(,%ecx
,4),%esi
502 movl counttemp
,%ecx
// retrieve count
505 // determine whether last span or not
507 cmpl $
8,%ecx
// are there multiple segments remaining?
509 ja LNotLastSegment
// yes
512 // last segment of scan
517 // advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
518 // get there. The number of pixels left is variable, and we want to land on the
519 // last pixel, not step one past it, so we can't run into arithmetic problems
522 jz LNoSteps // just draw the last pixel and we're done
524 // pick up after the FDIV that was left in flight previously
527 fld
%st(0) // duplicate it
528 fmul %st(4),%st(0) // s
= s
/z
* z
530 fmul %st(3),%st(0) // t = t/z
* z
535 movb
(%esi
),%al
// load first texel in segment
537 movb
%al
,(%edi
) // store first pixel in segment
543 movl C
(bbextents
),%ebp
544 movl C
(bbextentt
),%edx
559 cmpl $
1,%ecx
// don
't bother
560 je LOnlyOneStep // if two pixels in segment, there's only one step
,
561 // of the segment length
565 addl
%eax
,%eax
// convert to
15.17 format so multiply by
1.31
566 addl
%ebx
,%ebx
// reciprocal yields
16.48
568 imull reciprocal_table-
8(,%ecx
,4) // sstep
= (snext
- s
) / (spancount-
1)
572 imull reciprocal_table-
8(,%ecx
,4) // tstep
= (tnext
- t) / (spancount-
1)
576 // set up advancetable
578 movl entryvec_table
(,%ecx
,4),%ebx
580 movl
%ebx
,jumptemp
// entry point into code for RET later
582 sarl $
16,%edx
// tstep
>>= 16;
583 movl C
(cachewidth
),%ebx
584 sarl $
16,%ecx
// sstep
>>= 16;
587 addl
%ecx
,%edx
// add in sstep
588 // (tstep >> 16) * cachewidth + (sstep >> 16);
590 movl
%edx
,advancetable+
4 // advance base in
t
591 addl
%ebx
,%edx
// ((tstep
>> 16) + 1) * cachewidth
+
593 shll $
16,%ebp
// left-justify sstep fractional part
595 shll $
16,%eax
// left-justify tstep fractional part
596 movl
%edx
,advancetable
// advance extra in
t
603 adcl advancetable+
4(,%ecx
,4),%esi
605 jmp
*jumptemp
// jump to the number-of-pixels handler
607 //----------------------------------------
610 movb
(%esi
),%al
// load first texel in segment
611 subl $
7,%edi
// adjust for hardwired offset
622 //----------------------------------------
626 subl $
6,%edi
// adjust for hardwired offsets
630 //----------------------------------------
634 subl $
5,%edi
// adjust for hardwired offsets
639 adcl advancetable+
4(,%ecx
,4),%esi
642 //----------------------------------------
646 subl $
4,%edi
// adjust for hardwired offsets
651 adcl advancetable+
4(,%ecx
,4),%esi
655 //----------------------------------------
659 subl $
3,%edi
// adjust for hardwired offsets
664 adcl advancetable+
4(,%ecx
,4),%esi
668 //----------------------------------------
672 subl $
2,%edi
// adjust for hardwired offsets
677 adcl advancetable+
4(,%ecx
,4),%esi
681 //----------------------------------------
685 decl
%edi
// adjust for hardwired offsets
690 adcl advancetable+
4(,%ecx
,4),%esi
694 //----------------------------------------
702 adcl advancetable+
4(,%ecx
,4),%esi
709 adcl advancetable+
4(,%ecx
,4),%esi
716 adcl advancetable+
4(,%ecx
,4),%esi
723 adcl advancetable+
4(,%ecx
,4),%esi
730 adcl advancetable+
4(,%ecx
,4),%esi
737 adcl advancetable+
4(,%ecx
,4),%esi
746 // clear s/z, t/z, 1/z from FP stack
752 movl pspantemp
,%ebx
// restore spans pointer
753 movl espan_t_pnext
(%ebx
),%ebx
// point to next span
754 testl
%ebx
,%ebx
// any more spans?
756 jnz LSpanLoop
// more spans
758 popl
%ebx
// restore register variables
761 popl
%ebp
// restore the caller
's stack frame
764 //----------------------------------------------------------------------
765 // 8-bpp horizontal span z drawing code for polygons, with no transparency.
767 // Assumes there is at least one span in pzspans, and that every span
768 // contains at least one pixel
769 //----------------------------------------------------------------------
773 // z-clamp on a non-negative gradient span
775 movl $0x40000000,%edx
780 // z-clamp on a negative gradient span
782 movl $0x40000000,%edx
790 .globl C(D_DrawZSpans)
792 pushl %ebp // preserve caller's stack frame
794 pushl
%esi
// preserve register variables
798 movl C
(d_zistepu
),%eax
799 movl pzspans
(%esp
),%esi
803 fmuls Float2ToThe31nd
804 fistpl izistep
// note
: we are relying on FP exceptions being turned
805 // off here to avoid range problems
806 movl izistep
,%ebx
// remains loaded for all spans
809 // set up the initial 1/z value
810 fildl espan_t_v
(%esi
)
811 fildl espan_t_u
(%esi
)
812 movl espan_t_v
(%esi
),%ecx
813 movl C
(d_pzbuffer
),%edi
819 imull C
(d_zrowbytes
),%ecx
822 // clamp if z is nearer than 2 (1/z > 0.5)
825 movl espan_t_u
(%esi
),%edx
826 addl
%edx
,%edx
// word count
827 movl espan_t_count
(%esi
),%ecx
828 addl
%edx
,%edi
// pdest
= &pdestspan
[scans-
>u
];
829 pushl
%esi
// preserve spans pointer
834 fmuls Float2ToThe31nd
835 fistpl izi
// note
: we are relying on FP exceptions being turned
836 // off here to avoid problems when the span is closer
848 // do a single pixel up front, if necessary to dword align the destination
858 // do middle a pair of aligned dwords at a time
861 shrl $
1,%ecx
// count
/ 2
862 jz LFLast
// no aligned dwords to do
863 shrl $
1,%ecx
// (count
/ 2) / 2
864 jnc LFMiddleLoop
// even number of aligned dwords to do
871 andl $
0xFFFF0000,%esi
884 andl $
0xFFFF0000,%esi
892 andl $
0xFFFF0000,%esi
894 movl
%ebp
,4(%edi
) // FIXME
: eliminate register contention
901 popl
%ecx
// retrieve count
902 popl
%esi
// retrieve span pointer
904 // do the last, unaligned pixel, if there is one
905 andl $
1,%ecx
// is there an odd pixel left to do?
908 movw
%dx
,(%edi
) // do the final pixel
's z
911 movl espan_t_pnext(%esi),%esi
918 fmuls FloatMinus2ToThe31nd
919 fistpl izistep // note: we are relying on FP exceptions being turned
920 // off here to avoid range problems
921 movl izistep,%ebx // remains loaded for all spans
924 // set up the initial 1/z value
925 fildl espan_t_v(%esi)
926 fildl espan_t_u(%esi)
927 movl espan_t_v(%esi),%ecx
928 movl C(d_pzbuffer),%edi
934 imull C(d_zrowbytes),%ecx
937 // clamp if z is nearer than 2 (1/z > 0.5)
940 movl espan_t_u(%esi),%edx
941 addl %edx,%edx // word count
942 movl espan_t_count(%esi),%ecx
943 addl %edx,%edi // pdest = &pdestspan[scans->u];
944 pushl %esi // preserve spans pointer
949 fmuls Float2ToThe31nd
950 fistpl izi // note: we are relying on FP exceptions being turned
951 // off here to avoid problems when the span is closer
963 // do a single pixel up front, if necessary to dword align the destination
973 // do middle a pair of aligned dwords at a time
976 shrl $1,%ecx // count / 2
977 jz LFNegLast // no aligned dwords to do
978 shrl $1,%ecx // (count / 2) / 2
979 jnc LFNegMiddleLoop // even number of aligned dwords to do
986 andl $0xFFFF0000,%esi
999 andl $0xFFFF0000,%esi
1007 andl $0xFFFF0000,%esi
1009 movl %ebp,4(%edi) // FIXME: eliminate register contention
1016 popl %ecx // retrieve count
1017 popl %esi // retrieve span pointer
1019 // do the last, unaligned pixel, if there is one
1020 andl $1,%ecx // is there an odd pixel left to do?
1021 jz LFNegSpanDone // no
1023 movw %dx,(%edi) // do the final pixel's z
1026 movl espan_t_pnext
(%esi
),%esi
1031 popl
%ebx
// restore register variables
1034 popl
%ebp
// restore the caller
's stack frame