/*********************************************************************************************** * * Copyright © DreamWorks Interactive, 1997. * * Contents: * Specialized (non-template) versions of the DrawSubtriangle function. * Optimized for the AMD K6-3D Processor * *********************************************************************************************** * * $Log:: /JP2_PC/Source/Lib/Renderer/Primitives/AMDK6/DrawSubTriangleEx.cpp $ * * 20 98.10.03 11:10p Mmouni * Put self moding code in code segment "SelfMod". * * 19 9/05/98 12:18p Asouth * explicit scoping of 'fixed' * * 18 98.08.13 10:43p Mmouni * Changed to work with VC5.0sp3. * * 17 3/13/98 6:06p Mmouni * 16-bit transparent copy now does dword writes (if it can). * * 16 3/12/98 1:14p Mmouni * Fixed error in QWORD alignment. * * 15 2/17/98 5:42p Mmouni * Copy primitive now writes QWORD aligned QWORDS. * * 14 2/04/98 4:10p Mmouni * 16-bit copy primitive now write aligned DWORDS. * * 13 11/15/97 8:51p Mmouni * Added extra ALIGN,nop to linear filtered routine. * * 12 97.11.15 7:47p Mmouni * Optmized alignment of filtered inner loop. * * 11 97.11.15 12:01a Mmouni * Made bi-linear filter primitives seperate from non-filtered copy primitive. * * 10 97.11.11 9:49p Mmouni * Made changes to filtered copy primitives to support 555 or 565 color. * * 9 97.11.06 9:05p Mmouni * Added bi-linear filtered copy. * * 8 97.11.05 7:52p Mmouni * Added terrain fog mask. * * 7 97.11.04 10:59p Mmouni * Linear version of primitives now use 3dx/mmx. * * 6 97.10.30 1:39p Mmouni * All perspective sub-triangle routines are now 3DX/MMX. * * 5 97.10.27 1:25p Mmouni * Initial K6-3D version, copied from the Pentium versions. * * 16 97.10.22 7:11p Mmouni * Moved code that makes copies of edge values into assembly. * * 15 10/10/97 1:47p Mmouni * All inner loops are now left to right only, and optimized some. * * 14 9/30/97 9:32p Mmouni * Made seperate routines for 16-bit copy and terrain polygons. * * 13 9/29/97 11:49a Mmouni * Optimized, now has support for fogging. * * 12 9/15/97 2:10p Mmouni * Optimized inner loops (not final yet). * * 11 9/01/97 8:02p Rwyatt * * 10 8/17/97 10:00p Rwyatt * Inlined perspective for non-transparent 16bit copy. This is so its clamping can be control * outside of the general perspective code. * * 9 8/17/97 9:50p Rwyatt * changed the clamp condition to use VER_CLAMP_UV_16BIT and not the general clamp control * VER_CLAMP_UV_TILE * * 8 8/17/97 9:44p Rwyatt * Added temporary #defines to P5 clamp code. This clamp code needs to be optimized later. * * 7 8/15/97 2:33a Rwyatt * Removed local static variables, all renderer statics are now defined in DrawSubrtiangle.cpp * * 6 8/15/97 12:45a Rwyatt * Y Loop optimizarions and inlined perspective * * 5 8/01/97 6:57p Rwyatt * Linear, No Clut, Non-Transparent has been optimized for 32 bit writes. There is now only 2 16 * bit reads in the main loop and 1 32 bit write. This means that L to R and R to L scanlines * are completely different. * * 4 97/07/18 3:29p Pkeet * Removed use of the 'b_right' parameter from DrawSubTriangle. * * 3 97/07/16 15:49 Speter * Commented out redundant static delcarations now that all DrawSubTriangles are included into * one file (now just declared in DrawSubTriangleEx.cpp.) * * 2 7/14/97 12:36p Rwyatt * Checkled in so Scott can move them.... * * 1 7/07/97 5:43p Rwyatt * Initial Implementation of Pentium Specifics * *********************************************************************************************** *********************************************************************************************** * 22 6/25/97 3:28p Rwyatt * Re-aranged code * * 21 6/17/97 4:10p Rwyatt * Optimized perspective call * * 20 97/06/16 10:16p Pkeet * Moved linear primitives into non-template code. * * 19 97/06/16 8:47p Pkeet * Replaced template calls with explicit C++ code for perspective routines. * * 18 6/15/97 1:31a Bbell * Removed unnecessary include. * * 17 97/06/12 5:56p Pkeet * Added capability to render only every other scanline and to double the number of scanlines. * * 16 97/06/11 4:58p Pkeet * Added opaque versions of the optimized copy primitives. * * 15 6/11/97 4:11p Rwyatt * Removed 3 AGIs and 2 PRO stalls from the perspective primitive. These stalls were costing * about 40 clocks and the net result is 50% speed increase!!!!!!!! * * 14 6/11/97 2:45p Rwyatt * Removed 2 AGI stalls from the linear copy * * 13 6/11/97 2:04a Rwyatt * Slight;y optimized, need to run through V-Tune so we can spot any stalls * * 12 6/10/97 10:02p Rwyatt * Blazing linear transparent copy * * 11 6/10/97 4:34p Rwyatt * Slightly optimized perspective copy primitive * * 10 97/06/10 2:27p Pkeet * Moved the bumpmap routines into a seperate module. * * 9 97/06/10 1:20p Pkeet * Added initial assembly optimizations for bumpmap linear routines. * * 8 97/06/10 12:53p Pkeet * Fixed bug in bumpmap assembly. * * 7 97/06/10 10:51a Pkeet * Added initial assembly optimization passes for perspective corrected bumpmapping for both * transparent and non-transparent versions. * * 6 97/06/09 6:38p Pkeet * Algorithmic optimizations. * * 5 97/06/08 4:16p Pkeet * Added perspective correction bump map primitives. * * 4 97/06/06 8:51p Pkeet * Added linear. * * 3 97/06/06 6:44p Pkeet * Moved the inner loop to self-modifying code. * * 2 97/06/06 4:36p Pkeet * It works again. * * 1 97/06/04 6:46p Pkeet * Initial specialized version of DrawSubTriangle. * **********************************************************************************************/ #include "Common.hpp" #include "AsmSupport.hpp" #include "Lib/View/Raster.hpp" #include "Lib/Renderer/Primitives/DrawTriangle.hpp" // // Turn off the global optimized because it was re-arranging the assembly code // without account for the self-modification offsets. // #pragma optimize("g", off) // Put this stuff in a seperate section for alignment. #pragma code_seg("StriCopy") //***************************************************************************************** // // Perspective, No Clut, Transparent. // void DrawSubtriangle(TCopyPerspTrans* pscan, CDrawPolygon* pdtri) { ::fixed fx_inc; ::fixed fx_diff; TCopyPerspTrans* plinc = &pdtri->pedgeBase->lineIncrement; __asm { // Get ready to do MMX/3DX. femms // // Local copies of edge stepping values. // // fx_inc = pdtri->pedgeBase->lineIncrement.fxX; // fx_diff = pdtri->pedgeBase->lineIncrement.fxXDifference; // f_inc_uinvz = pdtri->pedgeBase->lineIncrement.indCoord.fUInvZ; // f_inc_vinvz = pdtri->pedgeBase->lineIncrement.indCoord.fVInvZ; // f_inc_invz = pdtri->pedgeBase->lineIncrement.indCoord.fInvZ; // mov eax,[plinc] mov esi,[pdtri] // Pointer to polygon object. mov ecx,[pscan] // Pointer to scanline object. mov ebx,[eax]TCopyPerspTrans.fxX mov edx,[eax]TCopyPerspTrans.fxXDifference mov fx_inc,ebx movq mm0,[eax]TCopyPerspTrans.indCoord.fUInvZ mov fx_diff,edx mov ebx,[eax]TCopyPerspTrans.indCoord.fInvZ movq [tvEdgeStep.UInvZ],mm0 mov [tvEdgeStep.InvZ],ebx // // Setup: // QWORD qMFactor = width,1,width,1 // Load global texture values. // movd mm3,[iTexWidth] // 0,0,0,Width movd mm4,[u4OneOne] // 0,0,1,1 punpcklwd mm3,mm3 // 0,0,Width,Width punpcklwd mm4,mm3 // Width,1,Width,1 movq mm0,[ecx]TCopyPerspTrans.indCoord.fUInvZ // mm0 = (V/Z,U/Z) movq [qMFactor],mm4 // Save mfactor movd mm1,[ecx]TCopyPerspTrans.indCoord.fInvZ // mm1 = (?,1/Z) // // Iterate through the scanlines that intersect the subtriangle. // // esi,ecx,mm0,mm1 setup. // Y_LOOP: mov eax,[esi]CDrawPolygonBase.iY mov ebx,[bEvenScanlinesOnly] // Check to see if we sould skip this scanline. and eax,ebx jnz END_OF_SCANLINE // // Setup for prespective correction and looping. // PERSPECTIVE_SCANLINE_PROLOGUE(TCopyPerspTrans) // // // Determine the start and end of the scanline. // mov ebx,[ecx]TCopyPerspTrans.fxX.i4Fx mov eax,[esi]CDrawPolygonBase.fxLineLength.i4Fx mov edx,[esi]CDrawPolygonBase.iLineStartIndex add ebx,eax mov eax,[ecx]TCopyPerspTrans.fxX.i4Fx sar ebx,16 sar eax,16 add edx,ebx sub eax,ebx // eax == i_pixel jge END_OF_SCANLINE // Load span increments. movq mm2,[tvDEdge.UInvZ] // Load V,U mov [i_screen_index],edx // Save scanline index. movd mm3,[tvDEdge.InvZ] // Load Z mov [i_pixel],eax // Save negative length of scanline. mov edi,[iSubdivideLen] // scan line is +ve add eax,edi // eax = i_pixel + SubdivideLen jle short DONE_DIVIDE_PIXEL // // Subdivision is smaller than iSubdivideLen // // Adjust span increments by -i_pixel * fInvSubdivideLen // mov edi,[i_pixel] // Integer width of span. xor eax,eax // Remaining width. neg edi movd mm4,edi // -i_pixel pi2fd (m4,m4) // Convert to fp. movd mm5,[fInvSubdivideLen] // 1/SubDivideLen pfmul (m4,m5) // -i_pixel * fInvSubdivideLen punpckldq mm4,mm4 pfmul (m2,m4) // Adjust V,U increments. pfmul (m3,m4) // Adjust Z increment. DONE_DIVIDE_PIXEL: // // Compute current U,V // Step fGUInvZ,fGVInvZ,fGInvZ // pfrcp (m4,m1) // f_z = mm4 = 1/fGInvZ mov [iNextSubdivide],eax // Save length remaining. pfadd (m3,m1) // Step fGInvZ mov edx,[bClampUV] // Load clamp flag. pfadd (m2,m0) // Step fGUInvZ,fGVInvZ movd [tvCurUVZ.InvZ],mm3 // Save updated 1/Z pfmul (m4,m0) // mm4 = V1,U1 pfrcp (m0,m3) // f_next_z = mm0 = 1/fGInvZ movq [tvCurUVZ.UInvZ],mm2 // Save updated U/Z,V/Z movd mm1,float ptr[fInverseIntTable+edi*4] // Reciprocal of span width. pfmul (m0,m2) // mm0 = V2,U2 movq mm2,[pfFixed16Scale] // Load fixed point scale factors. #if (VER_CLAMP_UV_16BIT == TRUE) movq mm3,[pfTexEdgeTolerance] movq mm5,[cvMaxCoords] pfmax (m0,m3) // Clamp U1,V1 to >= fTexEdgeTolerance pfmin (m0,m5) // Clamp U1 < fTexWidth, V1 < fTexHeight pfmax (m4,m3) // Clamp U1,V1 to >= fTexEdgeTolerance pfmin (m4,m5) // Clamp U1 < fTexWidth, V1 < fTexHeight #endif punpckldq mm1,mm1 // Duplicate reciprocal of span width. movq [pfCurUV],mm0 // Save ending U,V pfsub (m0,m4) // mm0 = V2-V1,U2-U1 pfmul (m4,m2) // Scale to starting U,V 16.16 mov edi,[i_pixel] // edi = current i_pixel mov [i_pixel],eax // Save updated i_pixel. pfmul (m0,m1) // DV*1/Width,DU*1/Width pf2id (m7,m4) // Starting V,U in mm7 sub edi,eax // edi = inner loop count mov ebx,[i_screen_index] // Load scanline offset. pfmul (m0,m2) // Scale U,V slope to 16.16 mov edx,gsGlobals.pvScreen // Pointer the screen. mov esi,[pvTextureBitmap] // Load texture base pointer. add eax,ebx // Add scanline offset to i_pixel pf2id (m6,m0) // VStep,UStep in mm6 lea edx,[edx+eax*2] // Base of span in edx. SUBDIVISION_LOOP: //--------------------------------------------------------------------------- // Draw the subdivision. // // mm6,mm7 = texture values. // // Setup registers: // // eax = texel_index2 ebx = texel_index1 // ecx = temp edx = dest_base_ptr // esi = texture_base_ptr edi = pixel_offset & count // ebp = base pointer // // mm0 = temp mm1 = temp // mm2 = mfactor mm3 = temp // mm6 = uvslope mm7 = uv (stepped once) // movq mm0,mm7 // Copy U1,V1 paddd mm7,mm6 // Step U,V psrad mm0,16 // Shift for integer U1,V1 movq mm1,mm7 // Copy U2,V2 movq mm2,[qMFactor] // Load texture multipliers psrad mm1,16 // Shift for integer U2,V2 sub esi,4 // Hack to force SIB + offset in loop packssdw mm0,mm1 // Pack integer texture values // // Check for alignment on a dword boundry. // lea eax,[edx + edi * 2] // Destination address. and eax,3 // Check for dword alignment. jz INNER_LOOP_2P inc edi // One pixel left? jz ONE_PIXEL_LEFT // // Do one pixel for alignment. // pmaddwd mm0,mm2 // Compute texture indicies. paddd mm7,mm6 // Step U,V movq [qIndexTemp],mm0 // Save texture indicies. movq mm0,mm7 // Copy new U2,V2 mov ebx,dword ptr[qIndexTemp] // Get texture index 1 psrad mm0,16 // Shift for integer U2,V2 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 packssdw mm1,mm0 // Pack integer texture values movq mm0,mm1 // Pack new U2,V2 with old U2,V2 test ebx,ebx jz INNER_LOOP_2P mov [edx + edi*2 - 2],bx // Store pixel 1 jmp short INNER_LOOP_2P // Align start of loop to 4 past the beginning of a cache line. ALIGN 16 nop ALIGN 16 nop nop nop nop // // Do two pixels at a time. // // mm0 = packed & masked U1,V1,U2,V2 // mm7 is stepped once. // INNER_LOOP_2P: add edi,2 // Inrcement index by 2 jge FINISH_REMAINDER // Loop (while >2 pixels) pmaddwd mm0,mm2 // Compute texture indicies. paddd mm7,mm6 // Step U,V movq [qIndexTemp],mm0 // Save texture indicies. movq mm0,mm7 // Copy U1,V1 mov eax,dword ptr[qIndexTemp + 4] // Get texture index 2 psrad mm0,16 // Shift for integer U1,V1 mov ebx,dword ptr[qIndexTemp] // Get texture index 1 paddd mm7,mm6 // Step U,V movzx eax,word ptr[esi + eax*2 + 4] // Lookup texel 2 movq mm1,mm7 // Copy U2,V2 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 psrad mm1,16 // Shift for integer U2,V2 packssdw mm0,mm1 // Pack integer texture values. shl eax,16 // Shift pixel 2 up. jz P2_TRANSPARENT test ebx,ebx // Test P1 jz P1_TRANSPARENT or eax,ebx // Combine pixels. mov [edx + edi*2 - 4],eax // Store both pixels. jmp short INNER_LOOP_2P P2_TRANSPARENT: // P1 may or may not be transparent. test ebx,ebx // Test P1 jz INNER_LOOP_2P mov [edx + edi*2 - 4],bx // Store pixel 1 jmp short INNER_LOOP_2P P1_TRANSPARENT: // P2 is not transparent. shr eax,16 // Shift pixel 2 down. mov [edx + edi*2 - 2],ax // Store pixel 2 jmp short INNER_LOOP_2P FINISH_REMAINDER: // // One pixel (edi == 1) or two pixels (edi == 0) left. // jnz short ONE_PIXEL_LEFT // // Two pixels left, only step texture co-ordinates once. // pmaddwd mm0,mm2 // Compute texture indicies. movq [qIndexTemp],mm0 // Save texture indicies. paddd mm7,mm6 // Step U,V mov eax,dword ptr[qIndexTemp + 4] // Get texture index 2 mov ebx,dword ptr[qIndexTemp] // Get texture index 1 movzx eax,word ptr[esi + eax*2 + 4] // Lookup texel 2 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 shl eax,16 // Shift pixel 2 up. jz P2_TRANSPARENT_END test ebx,ebx // Test P1 jz P1_TRANSPARENT_END or eax,ebx // Combine pixels. mov [edx - 4],eax // Store both pixels. mov eax,[i_pixel] // Load remaining length. jmp short END_OF_SPAN P2_TRANSPARENT_END: // P1 may or may not be transparent. test ebx,ebx // Test P1 mov eax,[i_pixel] // Load remaining length. jz END_OF_SPAN mov [edx - 4],bx // Store pixel 1 jmp short END_OF_SPAN P1_TRANSPARENT_END: // P2 is not transparent. shr eax,16 // Shift pixel 2 down. mov [edx - 2],ax // Store pixel 2 mov eax,[i_pixel] // Load remaining length. jmp short END_OF_SPAN ONE_PIXEL_LEFT: // // One pixel left, don't step texture co-ordinates. // pmaddwd mm0,mm2 // Compute texture indicies. movq [qIndexTemp],mm0 // Save texture indicies. mov eax,[i_pixel] // Load remaining length. mov ebx,dword ptr[qIndexTemp] // Get texture index 1 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 test ebx,ebx jz END_OF_SPAN mov [edx - 2],bx // Store pixel 1 END_OF_SPAN: // See if there are any pixels left. cmp eax,0 jz short END_OF_SCANLINE // // Do perspective correction and looping. // PERSPECTIVE_SCANLINE_EPILOGUE // //---------------------------------------------------------------------------------- // Implement perspective correction. // // eax = i_pixel // Preserve: mm7 // mov edi,[iSubdivideLen] // Load last texture values. movq mm0,[tvCurUVZ.UInvZ] // mm0 = (V/Z,U/Z) movd mm1,[tvCurUVZ.InvZ] // mm1 = (?,1/Z) // Load span increments. movq mm2,[tvDEdge.UInvZ] // V,U movd mm3,[tvDEdge.InvZ] // Z // scan line is +ve add eax,edi // eax = i_pixel + SubdivideLen jle short DONE_DIVIDE_PIXEL_END // // Subdivision is smaller than iSubdivideLen // // Adjust span increments by -i_pixel * fInvSubdivideLen // mov edi,[i_pixel] // Integer width of span. xor eax,eax // Remaining width. neg edi movd mm4,edi // -i_pixel pi2fd (m4,m4) // Convert to fp. movd mm5,[fInvSubdivideLen] // 1/SubDivideLen pfmul (m4,m5) // -i_pixel * fInvSubdivideLen punpckldq mm4,mm4 pfmul (m2,m4) // Adjust V,U increments. pfmul (m3,m4) // Adjust Z increment. DONE_DIVIDE_PIXEL_END: // // Compute current U,V // Step fGUInvZ,fGVInvZ,fGInvZ // pfadd (m3,m1) // Step fGInvZ mov [iNextSubdivide],eax // Save length remaining. pfadd (m2,m0) // Step fGUInvZ,fGVInvZ movd [tvCurUVZ.InvZ],mm3 // Save updated fGInvZ pfrcp (m0,m3) // f_z = mm0 = 1/fGInvZ movq mm4,[pfCurUV] // Load last U,V movq [tvCurUVZ.UInvZ],mm2 // Save updated fGUInvZ,fGVInvZ movd mm1,float ptr[fInverseIntTable+edi*4] // Reciprocal of span width. pfmul (m0,m2) // mm0 = V2,U2 movq mm2,[pfFixed16Scale] // Load fixed point scale factors. #if (VER_CLAMP_UV_16BIT == TRUE) movq mm3,[pfTexEdgeTolerance] movq mm5,[cvMaxCoords] pfmax (m0,m3) // Clamp U1,V1 to >= fTexEdgeTolerance pfmin (m0,m5) // Clamp U1 < fTexWidth, V1 < fTexHeight #endif movq [pfCurUV],mm0 // Save current U,V punpckldq mm1,mm1 // Duplicate across entire register. pfsub (m0,m4) // V2-V1,U2-U1 mov edi,[i_pixel] // edi = current i_pixel mov [i_pixel],eax // Save updated i_pixel. mov edx,gsGlobals.pvScreen // Pointer the screen. pfmul (m0,m1) // DV*1/Width,DU*1/Width mov ebx,[i_screen_index] // Load scanline offset. sub edi,eax // edi = inner loop count mov esi,[pvTextureBitmap] // Load texture base pointer. pfmul (m0,m2) // Scale to 16.16 add eax,ebx // Add scanline offset to i_pixel lea edx,[edx+eax*2] // Base of span in edx. pf2id (m6,m0) // VStep,UStep in mm6 jmp SUBDIVISION_LOOP END_OF_SCANLINE: // // Increment the base edge. // mov esi,[pdtri] // Pointer to polygon object. mov ecx,[pscan] // Pointer to scanline object. // Step length, line starting address. mov edx,[esi]CDrawPolygonBase.fxLineLength.i4Fx mov edi,[esi]CDrawPolygonBase.iLineStartIndex add edx,[esi]CDrawPolygonBase.fxDeltaLineLength.i4Fx add edi,gsGlobals.u4LinePixels mov eax,[ecx]TCopyPerspTrans.fxX.i4Fx mov [esi]CDrawPolygonBase.fxLineLength.i4Fx,edx mov ebx,[ecx]TCopyPerspTrans.fxXDifference.i4Fx mov [esi]CDrawPolygonBase.iLineStartIndex,edi // Step edge values. movq mm0,[ecx]TCopyPerspTrans.indCoord.fUInvZ movq mm2,[tvEdgeStep.UInvZ] movd mm1,[ecx]TCopyPerspTrans.indCoord.fInvZ movd mm3,[tvEdgeStep.InvZ] pfadd (m0,m2) mov edi,[fx_diff] pfadd (m1,m3) add eax,[fx_inc] add ebx,edi jge short NO_UNDERFLOW // Underflow. // Add borrow/carry adjustments to U,V,Z. movq mm2,[tvNegAdj.UInvZ] add ebx,0x00010000 movd mm3,[tvNegAdj.InvZ] pfadd (m0,m2) pfadd (m1,m3) jmp short FINISH_LOOPING NO_UNDERFLOW: cmp ebx,0x00010000 jl short FINISH_LOOPING // Overflow. // Add borrow/carry adjustments to U,V,Z. movq mm2,[tvNegAdj.UInvZ] sub ebx,0x00010000 movd mm3,[tvNegAdj.InvZ] pfadd (m0,m2) pfadd (m1,m3) FINISH_LOOPING: // Save values and Y Loop control. movq [ecx]TCopyPerspTrans.indCoord.fUInvZ,mm0 movd [ecx]TCopyPerspTrans.indCoord.fInvZ,mm1 mov [ecx]TCopyPerspTrans.fxX.i4Fx,eax mov edx,[esi]CDrawPolygonBase.iY mov [ecx]TCopyPerspTrans.fxXDifference.i4Fx,ebx mov ebx,[esi]CDrawPolygonBase.iYTo inc edx mov [esi]CDrawPolygonBase.iY,edx cmp edx,ebx jl Y_LOOP // Done with MMX/3DX. femms } } //***************************************************************************************** // // Linear, No clut, Transparent. // //****************************** void DrawSubtriangle(TCopyLinearTrans* pscan, CDrawPolygon* pdtri) { typedef TCopyLinearTrans::TIndex prevent_internal_compiler_errors; static CDrawPolygon* pdtriGlbl; static TCopyLinearTrans* pscanGlbl; static PackedFloatUV pf_texture_inc; TCopyLinearTrans* plinc = &pdtri->pedgeBase->lineIncrement; __asm { // Get ready to do MMX/3DX. femms mov edi,[plinc] // Pointer to scanline increment. mov esi,[pdtri] // Pointer to polygon object. mov eax,[pscan] // Pointer to scanline object. mov pdtriGlbl,esi // // Local copies of edge stepping values. // // fx_inc = pdtri->pedgeBase->lineIncrement.fxX; // fx_diff = pdtri->pedgeBase->lineIncrement.fxXDifference; // pf_texture_inc = pdtri->pedgeBase->lineIncrement.indCoord.pfIndex; // pdtriGlbl = pdtri; // pscanGlbl = pscan; // fx_gour_inc = pdtri->pedgeBase->lineIncrement.gourIntensity.fxIntensity; // mov ebx,[edi]TCopyLinearTrans.fxX mov pscanGlbl,eax mov ecx,[edi]TCopyLinearTrans.fxXDifference mov fx_inc,ebx movq mm0,[edi]TCopyLinearTrans.indCoord.pfIndex mov fx_diff,ecx movq [pf_texture_inc],mm0 // // Setup: // // mm2 = qMFactor = width,1,width,1 // mm3 = qUVMasks // mm6 = uvslope // movd mm1,[iTexWidth] // 0,0,0,Width movd mm2,[u4OneOne] // 0,0,1,1 punpcklwd mm1,mm1 // 0,0,Width,Width movq mm6,[pfxDeltaTex] // Load texture slopes. punpcklwd mm2,mm1 // Width,1,Width,1 movq mm7,[eax]TCopyLinearTrans.indCoord.pfIndex // // Iterate through the scanlines that intersect the subtriangle. // Y_LOOP: mov ebx,[bEvenScanlinesOnly] mov edx,[esi]CDrawPolygonBase.iY // Check for odd scan lines here and skip if even and edx,ebx jnz short END_OF_SCANLINE mov ebx,[eax]TCopyLinearTrans.fxX.i4Fx mov ecx,[esi]CDrawPolygonBase.fxLineLength.i4Fx mov edi,ebx mov edx,gsGlobals.pvScreen add ebx,ecx sar edi,16 mov ecx,[esi]CDrawPolygonBase.iLineStartIndex sar ebx,16 add ecx,ebx lea edx,[edx + ecx*2] sub edi,ebx jge short END_OF_SCANLINE // no pixels to draw //--------------------------------------------------------------------------- // Draw the scanline. // // mm6,mm7 = texture values. // // Setup registers: // // eax = texel_index2 ebx = texel_index1 // ecx = temp edx = dest_base_ptr // esi = texture_base_ptr edi = pixel_offset & count // ebp = base pointer // // mm0 = temp mm1 = temp // mm2 = mfactor mm3 = temp // mm6 = uvslope mm7 = uv (stepped once) // pf2id (m7,m7) // Convert texture values to 16.16 fixed. mov esi,[pvTextureBitmap] // Load texture ponter. movq mm0,mm7 // Copy U1,V1 paddd mm7,mm6 // Step U,V psrad mm0,16 // Shift for integer U1,V1 movq mm1,mm7 // Copy U2,V2 psrad mm1,16 // Shift for integer U2,V2 sub esi,4 // Hack to force SIB + offset in loop packssdw mm0,mm1 // Pack integer texture values // // Check for alignment on a dword boundry. // lea eax,[edx + edi * 2] // Destination address. and eax,3 // Check for dword alignment. jz INNER_LOOP_2P inc edi // One pixel left? jz ONE_PIXEL_LEFT // // Do one pixel for alignment. // pmaddwd mm0,mm2 // Compute texture indicies. paddd mm7,mm6 // Step U,V movq [qIndexTemp],mm0 // Save texture indicies. movq mm0,mm7 // Copy new U2,V2 mov ebx,dword ptr[qIndexTemp] // Get texture index 1 psrad mm0,16 // Shift for integer U2,V2 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 packssdw mm1,mm0 // Pack integer texture values movq mm0,mm1 // Pack new U2,V2 with old U2,V2 test ebx,ebx jz INNER_LOOP_2P mov [edx + edi*2 - 2],bx // Store pixel 1 jmp short INNER_LOOP_2P // Align start of loop to 4 past the beginning of a cache line. ALIGN 16 nop nop nop nop // // Do two pixels at a time. // // mm0 = packed & masked U1,V1,U2,V2 // mm7 is stepped once. // INNER_LOOP_2P: add edi,2 // Inrcement index by 2 jge FINISH_REMAINDER // Loop (while >2 pixels) pmaddwd mm0,mm2 // Compute texture indicies. paddd mm7,mm6 // Step U,V movq [qIndexTemp],mm0 // Save texture indicies. movq mm0,mm7 // Copy U1,V1 mov eax,dword ptr[qIndexTemp + 4] // Get texture index 2 psrad mm0,16 // Shift for integer U1,V1 mov ebx,dword ptr[qIndexTemp] // Get texture index 1 paddd mm7,mm6 // Step U,V movzx eax,word ptr[esi + eax*2 + 4] // Lookup texel 2 movq mm1,mm7 // Copy U2,V2 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 psrad mm1,16 // Shift for integer U2,V2 packssdw mm0,mm1 // Pack integer texture values. shl eax,16 // Shift pixel 2 up. jz P2_TRANSPARENT test ebx,ebx // Test P1 jz P1_TRANSPARENT or eax,ebx // Combine pixels. mov [edx + edi*2 - 4],eax // Store both pixels. jmp short INNER_LOOP_2P P2_TRANSPARENT: // P1 may or may not be transparent. test ebx,ebx // Test P1 jz INNER_LOOP_2P mov [edx + edi*2 - 4],bx // Store pixel 1 jmp short INNER_LOOP_2P P1_TRANSPARENT: // P2 is not transparent. shr eax,16 // Shift pixel 2 down. mov [edx + edi*2 - 2],ax // Store pixel 2 jmp short INNER_LOOP_2P FINISH_REMAINDER: // // One pixel (edi == 1) or two pixels (edi == 0) left. // jnz short ONE_PIXEL_LEFT // // Two pixels left, don't step texture co-ordinates. // pmaddwd mm0,mm2 // Compute texture indicies. movq [qIndexTemp],mm0 // Save texture indicies. mov eax,dword ptr[qIndexTemp + 4] // Get texture index 2 mov ebx,dword ptr[qIndexTemp] // Get texture index 1 movzx eax,word ptr[esi + eax*2 + 4] // Lookup texel 2 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 shl eax,16 // Shift pixel 2 up. jz P2_TRANSPARENT_END test ebx,ebx // Test P1 jz P1_TRANSPARENT_END or eax,ebx // Combine pixels. mov [edx - 4],eax // Store both pixels. jmp short END_OF_SCANLINE P2_TRANSPARENT_END: // P1 may or may not be transparent. test ebx,ebx // Test P1 jz END_OF_SCANLINE mov [edx - 4],bx // Store pixel 1 jmp short END_OF_SCANLINE P1_TRANSPARENT_END: // P2 is not transparent. shr eax,16 // Shift pixel 2 down. mov [edx - 2],ax // Store pixel 2 mov eax,[i_pixel] // Load remaining length. jmp short END_OF_SCANLINE ONE_PIXEL_LEFT: // // One pixel left, don't step texture co-ordinates. // pmaddwd mm0,mm2 // Compute texture indicies. movq [qIndexTemp],mm0 // Save texture indicies. mov ebx,dword ptr[qIndexTemp] // Get texture index 1 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 test ebx,ebx jz END_OF_SCANLINE mov [edx - 2],bx // Store pixel 1 // ------------------------------------------------------------------------- // Increment the base edge. // END_OF_SCANLINE: mov eax,[pscanGlbl] movq mm1,[pf_texture_inc] movq mm7,[eax]TCopyLinearTrans.indCoord.pfIndex mov ebx,[eax]TCopyLinearTrans.fxXDifference pfadd (m7,m1) add ebx,[fx_diff] jge short NO_UNDERFLOW // Undeflow of fxXDifference. add ebx,0x00010000 movq mm1,[pfNegD] pfadd (m7,m1) jmp short EDGE_DONE NO_UNDERFLOW: cmp ebx,0x00010000 jl short EDGE_DONE // Overflow of fxXDifference. sub ebx,0x00010000 movq mm1,[pfNegD] pfadd (m7,m1) EDGE_DONE: // Store modified variables and do looping. // Step length, line starting address. mov esi,[pdtriGlbl] mov [eax]TCopyLinearTrans.fxXDifference,ebx movq [eax]TCopyLinearTrans.indCoord.pfIndex,mm7 mov ebx,[esi]CDrawPolygonBase.fxLineLength.i4Fx mov ecx,[esi]CDrawPolygonBase.iLineStartIndex add ebx,[esi]CDrawPolygonBase.fxDeltaLineLength.i4Fx add ecx,gsGlobals.u4LinePixels mov [esi]CDrawPolygonBase.fxLineLength.i4Fx,ebx mov [esi]CDrawPolygonBase.iLineStartIndex,ecx mov ecx,[eax]TCopyLinearTrans.fxX mov edx,[fx_inc] add ecx,edx mov edx,[esi]CDrawPolygonBase.iY mov [eax]TCopyLinearTrans.fxX,ecx inc edx mov ebx,[esi]CDrawPolygonBase.iYTo mov [esi]CDrawPolygonBase.iY,edx cmp edx,ebx jl Y_LOOP // Done with MMX/3DX. femms } } //***************************************************************************************** // // Perspective, no Clut, non-transparent. // void DrawSubtriangle(TCopyPersp* pscan, CDrawPolygon* pdtri) { ::fixed fx_inc; ::fixed fx_diff; TCopyPersp* plinc = &pdtri->pedgeBase->lineIncrement; __asm { // Get ready to do MMX/3DX. femms // // Local copies of edge stepping values. // // fx_inc = pdtri->pedgeBase->lineIncrement.fxX; // fx_diff = pdtri->pedgeBase->lineIncrement.fxXDifference; // f_inc_uinvz = pdtri->pedgeBase->lineIncrement.indCoord.fUInvZ; // f_inc_vinvz = pdtri->pedgeBase->lineIncrement.indCoord.fVInvZ; // f_inc_invz = pdtri->pedgeBase->lineIncrement.indCoord.fInvZ; // mov eax,[plinc] mov esi,[pdtri] // Pointer to polygon object. mov ecx,[pscan] // Pointer to scanline object. mov ebx,[eax]TCopyPersp.fxX mov edx,[eax]TCopyPersp.fxXDifference mov fx_inc,ebx movq mm0,[eax]TCopyPersp.indCoord.fUInvZ mov fx_diff,edx mov ebx,[eax]TCopyPersp.indCoord.fInvZ movq [tvEdgeStep.UInvZ],mm0 mov [tvEdgeStep.InvZ],ebx // // Setup: // QWORD qMFactor = width,1,width,1 // Load global texture values. // movd mm3,[iTexWidth] // 0,0,0,Width movd mm4,[u4OneOne] // 0,0,1,1 punpcklwd mm3,mm3 // 0,0,Width,Width punpcklwd mm4,mm3 // Width,1,Width,1 movq mm0,[ecx]TCopyPersp.indCoord.fUInvZ // mm0 = (V/Z,U/Z) movq [qMFactor],mm4 // Save mfactor movd mm1,[ecx]TCopyPersp.indCoord.fInvZ // mm1 = (?,1/Z) // // Iterate through the scanlines that intersect the subtriangle. // // esi,ecx,mm0,mm1 setup. // Y_LOOP: mov eax,[esi]CDrawPolygonBase.iY mov ebx,[bEvenScanlinesOnly] // Check to see if we sould skip this scanline. and eax,ebx jnz END_OF_SCANLINE // // Setup for prespective correction and looping. // PERSPECTIVE_SCANLINE_PROLOGUE(TCopyPersp) // // // Determine the start and end of the scanline. // mov ebx,[ecx]TCopyPersp.fxX.i4Fx mov eax,[esi]CDrawPolygonBase.fxLineLength.i4Fx mov edx,[esi]CDrawPolygonBase.iLineStartIndex add ebx,eax mov eax,[ecx]TCopyPersp.fxX.i4Fx sar ebx,16 sar eax,16 add edx,ebx sub eax,ebx // eax == i_pixel jge END_OF_SCANLINE // Load span increments. movq mm2,[tvDEdge.UInvZ] // Load V,U mov [i_screen_index],edx // Save scanline index. movd mm3,[tvDEdge.InvZ] // Load Z mov [i_pixel],eax // Save negative length of scanline. mov edi,[iSubdivideLen] // scan line is +ve add eax,edi // eax = i_pixel + SubdivideLen jle short DONE_DIVIDE_PIXEL // // Subdivision is smaller than iSubdivideLen // // Adjust span increments by -i_pixel * fInvSubdivideLen // mov edi,[i_pixel] // Integer width of span. xor eax,eax // Remaining width. neg edi movd mm4,edi // -i_pixel pi2fd (m4,m4) // Convert to fp. movd mm5,[fInvSubdivideLen] // 1/SubDivideLen pfmul (m4,m5) // -i_pixel * fInvSubdivideLen punpckldq mm4,mm4 pfmul (m2,m4) // Adjust V,U increments. pfmul (m3,m4) // Adjust Z increment. DONE_DIVIDE_PIXEL: // // Compute current U,V // Step fGUInvZ,fGVInvZ,fGInvZ // pfrcp (m4,m1) // f_z = mm4 = 1/fGInvZ mov [iNextSubdivide],eax // Save length remaining. pfadd (m3,m1) // Step fGInvZ mov edx,[bClampUV] // Load clamp flag. pfadd (m2,m0) // Step fGUInvZ,fGVInvZ movd [tvCurUVZ.InvZ],mm3 // Save updated 1/Z pfmul (m4,m0) // mm4 = V1,U1 pfrcp (m0,m3) // f_next_z = mm0 = 1/fGInvZ movq [tvCurUVZ.UInvZ],mm2 // Save updated U/Z,V/Z movd mm1,float ptr[fInverseIntTable+edi*4] // Reciprocal of span width. pfmul (m0,m2) // mm0 = V2,U2 movq mm2,[pfFixed16Scale] // Load fixed point scale factors. #if (VER_CLAMP_UV_16BIT == TRUE) movq mm3,[pfTexEdgeTolerance] movq mm5,[cvMaxCoords] pfmax (m0,m3) // Clamp U1,V1 to >= fTexEdgeTolerance pfmin (m0,m5) // Clamp U1 < fTexWidth, V1 < fTexHeight pfmax (m4,m3) // Clamp U1,V1 to >= fTexEdgeTolerance pfmin (m4,m5) // Clamp U1 < fTexWidth, V1 < fTexHeight #endif punpckldq mm1,mm1 // Duplicate reciprocal of span width. movq [pfCurUV],mm0 // Save ending U,V pfsub (m0,m4) // mm0 = V2-V1,U2-U1 pfmul (m4,m2) // Scale to starting U,V 16.16 mov edi,[i_pixel] // edi = current i_pixel mov [i_pixel],eax // Save updated i_pixel. pfmul (m0,m1) // DV*1/Width,DU*1/Width pf2id (m7,m4) // Starting V,U in mm7 sub edi,eax // edi = inner loop count mov ebx,[i_screen_index] // Load scanline offset. pfmul (m0,m2) // Scale U,V slope to 16.16 mov edx,gsGlobals.pvScreen // Pointer the screen. mov esi,[pvTextureBitmap] // Load texture base pointer. add eax,ebx // Add scanline offset to i_pixel pf2id (m6,m0) // VStep,UStep in mm6 lea edx,[edx+eax*2] // Base of span in edx. SUBDIVISION_LOOP: //--------------------------------------------------------------------------- // Draw the subdivision. // // mm6,mm7 = texture values. // // Setup registers: // // eax = texel_index2 ebx = texel_index1 // ecx = temp edx = dest_base_ptr // esi = texture_base_ptr edi = pixel_offset & count // ebp = base pointer // // mm0 = temp mm1 = temp // mm2 = mfactor mm3 = temp // mm6 = uvslope mm7 = uv (stepped once) // movq mm0,mm7 // Copy U1,V1 paddd mm7,mm6 // Step U,V psrad mm0,16 // Shift for integer U1,V1 movq mm1,mm7 // Copy U2,V2 movq mm2,[qMFactor] // Load texture multipliers psrad mm1,16 // Shift for integer U2,V2 sub esi,4 // Hack to force SIB + offset in loop packssdw mm0,mm1 // Pack integer texture values // // Check for alignment on a dword boundry. // lea eax,[edx + edi * 2] and eax,3 jz ALIGNED2 inc edi jz ONE_PIXEL_LEFT // // Do one pixel for alignment. // pmaddwd mm0,mm2 // Compute texture indicies. paddd mm7,mm6 // Step U,V movq [qIndexTemp],mm0 // Save texture indicies. movq mm0,mm7 // Copy new U2,V2 mov ebx,dword ptr[qIndexTemp] // Get texture index 1 psrad mm0,16 // Shift for integer U2,V2 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 packssdw mm1,mm0 // Pack integer texture values movq mm0,mm1 // Pack new U2,V2 with old U2,V2 lea eax,[edx + edi * 2] // Update eax. mov [edx + edi*2 - 2],bx // Store pixel 1 ALIGNED2: and eax,7 jz ALIGNED4 add edi,2 jge FINISH_REMAINDER2 // // Do two pixels for alignment. // pmaddwd mm0,mm2 // Compute texture indicies. paddd mm7,mm6 // Step U,V movq [qIndexTemp],mm0 // Save texture indicies. movq mm0,mm7 // Copy U1,V1 mov eax,dword ptr[qIndexTemp + 4] // Get texture index 2 psrad mm0,16 // Shift for integer U1,V1 mov ebx,dword ptr[qIndexTemp] // Get texture index 1 paddd mm7,mm6 // Step U,V movzx eax,word ptr[esi + eax*2 + 4] // Lookup texel 2 movq mm1,mm7 // Copy U2,V2 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 psrad mm1,16 // Shift for integer U2,V2 packssdw mm0,mm1 // Pack integer texture values. shl eax,16 // Shift second pixel up or eax,ebx // Combine pixels. mov [edx + edi*2 - 4],eax // Store both pixels ALIGNED4: // // Make sure we have more than 4 pixels left. // add edi,4 jl INNER_LOOP_4P jmp FINISH_REMAINDER4 // Align start of loop to 4 past the beginning of a cache line. ALIGN 16 nop nop nop nop // // Do four pixels at a time. // // mm0 = packed & masked U1,V1,U2,V2 // mm7 is stepped once. // INNER_LOOP_4P: pmaddwd mm0,mm2 // Compute texture indicies. paddd mm7,mm6 // Step U,V movq [qIndexTemp],mm0 // Save texture indicies. movq mm0,mm7 // Copy U1,V1 mov eax,dword ptr[qIndexTemp + 4] // Get texture index 2 psrad mm0,16 // Shift for integer U1,V1 mov ebx,dword ptr[qIndexTemp] // Get texture index 1 paddd mm7,mm6 // Step U,V movzx eax,word ptr[esi + eax*2 + 4] // Lookup texel 2 movq mm1,mm7 // Copy U2,V2 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 psrad mm1,16 // Shift for integer U2,V2 packssdw mm0,mm1 // Pack integer texture values. shl eax,16 // Shift second pixel up pmaddwd mm0,mm2 // Compute texture indicies. paddd mm7,mm6 // Step U,V or eax,ebx // Combine pixels. movd mm3,eax // Copy P2,P1 movq [qIndexTemp],mm0 // Save texture indicies. movq mm0,mm7 // Copy U1,V1 mov eax,dword ptr[qIndexTemp + 4] // Get texture index 2 psrad mm0,16 // Shift for integer U1,V1 mov ebx,dword ptr[qIndexTemp] // Get texture index 1 paddd mm7,mm6 // Step U,V movzx eax,word ptr[esi + eax*2 + 4] // Lookup texel 2 movq mm1,mm7 // Copy U2,V2 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 psrad mm1,16 // Shift for integer U2,V2 packssdw mm0,mm1 // Pack integer texture values. shl eax,16 // Shift second pixel up or eax,ebx // Combine pixels. movd mm4,eax // Copy P4,P3 punpckldq mm3,mm4 // Combine P4,P3,P2,P1 movq [edx + edi*2 - 8],mm3 // Store 4 pixels add edi,4 // Increment index by 4 jl INNER_LOOP_4P // Loop (while >4 pixels) FINISH_REMAINDER4: // // One (edi == 3), two (edi == 2), three (edi == 1) or four (edi == 0) pixels left. // jnz THREE_OR_LESS // // Four pixels left, step texture coords 3 times. // pmaddwd mm0,mm2 // Compute texture indicies. paddd mm7,mm6 // Step U,V movq [qIndexTemp],mm0 // Save texture indicies. movq mm0,mm7 // Copy U1,V1 mov eax,dword ptr[qIndexTemp + 4] // Get texture index 2 psrad mm0,16 // Shift for integer U1,V1 mov ebx,dword ptr[qIndexTemp] // Get texture index 1 paddd mm7,mm6 // Step U,V movzx eax,word ptr[esi + eax*2 + 4] // Lookup texel 2 movq mm1,mm7 // Copy U2,V2 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 psrad mm1,16 // Shift for integer U2,V2 packssdw mm0,mm1 // Pack integer texture values. shl eax,16 // Shift second pixel up. pmaddwd mm0,mm2 // Compute texture indicies. paddd mm7,mm6 // Step U,V or eax,ebx // Combine pixels movd mm3,eax // Copy P2,P1 movq [qIndexTemp],mm0 // Save texture indicies. mov eax,dword ptr[qIndexTemp + 4] // Get texture index 2 mov ebx,dword ptr[qIndexTemp] // Get texture index 1 movzx eax,word ptr[esi + eax*2 + 4] // Lookup texel 2 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 shl eax,16 // Shift second pixel up or eax,ebx // Combine pixels. movd mm4,eax // Copy P4,P3 punpckldq mm3,mm4 // Combine P4,P3,P2,P1 movq [edx - 8],mm3 // Store 4 pixels mov eax,[i_pixel] // Load remaining length. jmp END_OF_SPAN THREE_OR_LESS: sub edi,2 jge FINISH_REMAINDER2 // // Three pixels left, step texture coords twice. // pmaddwd mm0,mm2 // Compute texture indicies. paddd mm7,mm6 // Step U,V movq [qIndexTemp],mm0 // Save texture indicies. movq mm0,mm7 // Copy U1,V1 mov eax,dword ptr[qIndexTemp + 4] // Get texture index 2 psrad mm0,16 // Shift for integer U1,V1 mov ebx,dword ptr[qIndexTemp] // Get texture index 1 paddd mm7,mm6 // Step U,V movzx eax,word ptr[esi + eax*2 + 4] // Lookup texel 2 movq mm1,mm7 // Copy U2,V2 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 psrad mm1,16 // Shift for integer U2,V2 packssdw mm0,mm1 // Pack integer texture values. shl eax,16 // Shift second pixel up or eax,ebx // Combine pixels. mov [edx - 6],eax // Store both pixels pmaddwd mm0,mm2 // Compute texture indicies. movq [qIndexTemp],mm0 // Save texture indicies. mov eax,[i_pixel] // Load remaining length. mov ebx,dword ptr[qIndexTemp] // Get texture index 1 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 mov [edx - 2],bx // Store pixel 1 jmp END_OF_SPAN FINISH_REMAINDER2: // // One pixel (edi == 1) or two pixels (edi == 0) left. // jnz ONE_PIXEL_LEFT // // Two pixels left, only step texture co-ordinates once. // pmaddwd mm0,mm2 // Compute texture indicies. paddd mm7,mm6 // Step U,V movq [qIndexTemp],mm0 // Save texture indicies. mov eax,dword ptr[qIndexTemp + 4] // Get texture index 2 mov ebx,dword ptr[qIndexTemp] // Get texture index 1 movzx eax,word ptr[esi + eax*2 + 4] // Lookup texel 2 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 shl eax,16 // Shift second pixel up or eax,ebx // Combine pixels. mov [edx - 4],eax // Store both pixels mov eax,[i_pixel] // Load remaining length. jmp END_OF_SPAN ONE_PIXEL_LEFT: // // One pixel left, don't step texture co-ordinates. // pmaddwd mm0,mm2 // Compute texture indicies. movq [qIndexTemp],mm0 // Save texture indicies. mov eax,[i_pixel] // Load remaining length. mov ebx,dword ptr[qIndexTemp] // Get texture index 1 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 mov [edx - 2],bx // Store pixel 1 END_OF_SPAN: // See if there are any pixels left. cmp eax,0 jz short END_OF_SCANLINE // // Do perspective correction and looping. // PERSPECTIVE_SCANLINE_EPILOGUE // //---------------------------------------------------------------------------------- // Implement perspective correction. // // eax = i_pixel // Preserve: mm7 // mov edi,[iSubdivideLen] // Load last texture values. movq mm0,[tvCurUVZ.UInvZ] // mm0 = (V/Z,U/Z) movd mm1,[tvCurUVZ.InvZ] // mm1 = (?,1/Z) // Load span increments. movq mm2,[tvDEdge.UInvZ] // V,U movd mm3,[tvDEdge.InvZ] // Z // scan line is +ve add eax,edi // eax = i_pixel + SubdivideLen jle short DONE_DIVIDE_PIXEL_END // // Subdivision is smaller than iSubdivideLen // // Adjust span increments by -i_pixel * fInvSubdivideLen // mov edi,[i_pixel] // Integer width of span. xor eax,eax // Remaining width. neg edi movd mm4,edi // -i_pixel pi2fd (m4,m4) // Convert to fp. movd mm5,[fInvSubdivideLen] // 1/SubDivideLen pfmul (m4,m5) // -i_pixel * fInvSubdivideLen punpckldq mm4,mm4 pfmul (m2,m4) // Adjust V,U increments. pfmul (m3,m4) // Adjust Z increment. DONE_DIVIDE_PIXEL_END: // // Compute current U,V // Step fGUInvZ,fGVInvZ,fGInvZ // pfadd (m3,m1) // Step fGInvZ mov [iNextSubdivide],eax // Save length remaining. pfadd (m2,m0) // Step fGUInvZ,fGVInvZ movd [tvCurUVZ.InvZ],mm3 // Save updated fGInvZ pfrcp (m0,m3) // f_z = mm0 = 1/fGInvZ movq mm4,[pfCurUV] // Load last U,V movq [tvCurUVZ.UInvZ],mm2 // Save updated fGUInvZ,fGVInvZ movd mm1,float ptr[fInverseIntTable+edi*4] // Reciprocal of span width. pfmul (m0,m2) // mm0 = V2,U2 movq mm2,[pfFixed16Scale] // Load fixed point scale factors. #if (VER_CLAMP_UV_16BIT == TRUE) movq mm3,[pfTexEdgeTolerance] movq mm5,[cvMaxCoords] pfmax (m0,m3) // Clamp U1,V1 to >= fTexEdgeTolerance pfmin (m0,m5) // Clamp U1 < fTexWidth, V1 < fTexHeight #endif movq [pfCurUV],mm0 // Save current U,V punpckldq mm1,mm1 // Duplicate across entire register. pfsub (m0,m4) // V2-V1,U2-U1 mov edi,[i_pixel] // edi = current i_pixel mov [i_pixel],eax // Save updated i_pixel. mov edx,gsGlobals.pvScreen // Pointer the screen. pfmul (m0,m1) // DV*1/Width,DU*1/Width mov ebx,[i_screen_index] // Load scanline offset. sub edi,eax // edi = inner loop count mov esi,[pvTextureBitmap] // Load texture base pointer. pfmul (m0,m2) // Scale to 16.16 add eax,ebx // Add scanline offset to i_pixel lea edx,[edx+eax*2] // Base of span in edx. pf2id (m6,m0) // VStep,UStep in mm6 jmp SUBDIVISION_LOOP END_OF_SCANLINE: // // Increment the base edge. // mov esi,[pdtri] // Pointer to polygon object. mov ecx,[pscan] // Pointer to scanline object. // Step length, line starting address. mov edx,[esi]CDrawPolygonBase.fxLineLength.i4Fx mov edi,[esi]CDrawPolygonBase.iLineStartIndex add edx,[esi]CDrawPolygonBase.fxDeltaLineLength.i4Fx add edi,gsGlobals.u4LinePixels mov eax,[ecx]TCopyPersp.fxX.i4Fx mov [esi]CDrawPolygonBase.fxLineLength.i4Fx,edx mov ebx,[ecx]TCopyPersp.fxXDifference.i4Fx mov [esi]CDrawPolygonBase.iLineStartIndex,edi // Step edge values. movq mm0,[ecx]TCopyPersp.indCoord.fUInvZ movq mm2,[tvEdgeStep.UInvZ] movd mm1,[ecx]TCopyPersp.indCoord.fInvZ movd mm3,[tvEdgeStep.InvZ] pfadd (m0,m2) mov edi,[fx_diff] pfadd (m1,m3) add eax,[fx_inc] add ebx,edi jge short NO_UNDERFLOW // Underflow. // Add borrow/carry adjustments to U,V,Z. movq mm2,[tvNegAdj.UInvZ] add ebx,0x00010000 movd mm3,[tvNegAdj.InvZ] pfadd (m0,m2) pfadd (m1,m3) jmp short FINISH_LOOPING NO_UNDERFLOW: cmp ebx,0x00010000 jl short FINISH_LOOPING // Overflow. // Add borrow/carry adjustments to U,V,Z. movq mm2,[tvNegAdj.UInvZ] sub ebx,0x00010000 movd mm3,[tvNegAdj.InvZ] pfadd (m0,m2) pfadd (m1,m3) FINISH_LOOPING: // Save values and Y Loop control. movq [ecx]TCopyPersp.indCoord.fUInvZ,mm0 movd [ecx]TCopyPersp.indCoord.fInvZ,mm1 mov [ecx]TCopyPersp.fxX.i4Fx,eax mov edx,[esi]CDrawPolygonBase.iY mov [ecx]TCopyPersp.fxXDifference.i4Fx,ebx mov ebx,[esi]CDrawPolygonBase.iYTo inc edx mov [esi]CDrawPolygonBase.iY,edx cmp edx,ebx jl Y_LOOP // Done with MMX/3DX. femms } } //***************************************************************************************** // // Linear, no clut, non-transparent. // //********************************** void DrawSubtriangle(TCopyLinear* pscan, CDrawPolygon* pdtri) { typedef TCopyLinear::TIndex prevent_internal_compiler_errors; static CDrawPolygon* pdtriGlbl; static TCopyLinear* pscanGlbl; static PackedFloatUV pf_texture_inc; TCopyLinear* plinc = &pdtri->pedgeBase->lineIncrement; __asm { // Get ready to do MMX/3DX. femms mov edi,[plinc] // Pointer to scanline increment. mov esi,[pdtri] // Pointer to polygon object. mov eax,[pscan] // Pointer to scanline object. mov pdtriGlbl,esi // // Local copies of edge stepping values. // // fx_inc = pdtri->pedgeBase->lineIncrement.fxX; // fx_diff = pdtri->pedgeBase->lineIncrement.fxXDifference; // pf_texture_inc = pdtri->pedgeBase->lineIncrement.indCoord.pfIndex; // pdtriGlbl = pdtri; // pscanGlbl = pscan; // fx_gour_inc = pdtri->pedgeBase->lineIncrement.gourIntensity.fxIntensity; // mov ebx,[edi]TCopyLinear.fxX mov pscanGlbl,eax mov ecx,[edi]TCopyLinear.fxXDifference mov fx_inc,ebx movq mm0,[edi]TCopyLinear.indCoord.pfIndex mov fx_diff,ecx movq [pf_texture_inc],mm0 // // Setup: // // mm2 = qMFactor = width,1,width,1 // mm3 = qUVMasks // mm6 = uvslope // movd mm1,[iTexWidth] // 0,0,0,Width movd mm2,[u4OneOne] // 0,0,1,1 punpcklwd mm1,mm1 // 0,0,Width,Width movq mm6,[pfxDeltaTex] // Load texture slopes. punpcklwd mm2,mm1 // Width,1,Width,1 movq mm7,[eax]TCopyLinear.indCoord.pfIndex // // Iterate through the scanlines that intersect the subtriangle. // Y_LOOP: mov ebx,[bEvenScanlinesOnly] mov edx,[esi]CDrawPolygonBase.iY // Check for odd scan lines here and skip if even and edx,ebx jnz short END_OF_SCANLINE mov ebx,[eax]TCopyLinear.fxX.i4Fx mov ecx,[esi]CDrawPolygonBase.fxLineLength.i4Fx mov edi,ebx mov edx,gsGlobals.pvScreen add ebx,ecx sar edi,16 mov ecx,[esi]CDrawPolygonBase.iLineStartIndex sar ebx,16 add ecx,ebx lea edx,[edx + ecx*2] sub edi,ebx jge short END_OF_SCANLINE // no pixels to draw //--------------------------------------------------------------------------- // Draw the scanline. // // mm6,mm7 = texture values. // // Setup registers: // // eax = texel_index2 ebx = texel_index1 // ecx = temp edx = dest_base_ptr // esi = texture_base_ptr edi = pixel_offset & count // ebp = base pointer // // mm0 = temp mm1 = temp // mm2 = mfactor mm3 = temp // mm6 = uvslope mm7 = uv (stepped once) // pf2id (m7,m7) // Convert texture values to 16.16 fixed. mov esi,[pvTextureBitmap] // Load texture ponter. movq mm0,mm7 // Copy U1,V1 paddd mm7,mm6 // Step U,V psrad mm0,16 // Shift for integer U1,V1 movq mm1,mm7 // Copy U2,V2 psrad mm1,16 // Shift for integer U2,V2 sub esi,4 // Hack to force SIB + offset in loop packssdw mm0,mm1 // Pack integer texture values // // Check for alignment on a dword boundry. // lea eax,[edx + edi * 2] and eax,3 jz ALIGNED2 inc edi jz ONE_PIXEL_LEFT // // Do one pixel for alignment. // pmaddwd mm0,mm2 // Compute texture indicies. paddd mm7,mm6 // Step U,V movq [qIndexTemp],mm0 // Save texture indicies. movq mm0,mm7 // Copy new U2,V2 mov ebx,dword ptr[qIndexTemp] // Get texture index 1 psrad mm0,16 // Shift for integer U2,V2 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 packssdw mm1,mm0 // Pack integer texture values movq mm0,mm1 // Pack new U2,V2 with old U2,V2 lea eax,[edx + edi * 2] // Update eax. mov [edx + edi*2 - 2],bx // Store pixel 1 ALIGNED2: and eax,7 jz ALIGNED4 add edi,2 jge FINISH_REMAINDER2 // // Do two pixels for alignment. // pmaddwd mm0,mm2 // Compute texture indicies. paddd mm7,mm6 // Step U,V movq [qIndexTemp],mm0 // Save texture indicies. movq mm0,mm7 // Copy U1,V1 mov eax,dword ptr[qIndexTemp + 4] // Get texture index 2 psrad mm0,16 // Shift for integer U1,V1 mov ebx,dword ptr[qIndexTemp] // Get texture index 1 paddd mm7,mm6 // Step U,V movzx eax,word ptr[esi + eax*2 + 4] // Lookup texel 2 movq mm1,mm7 // Copy U2,V2 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 psrad mm1,16 // Shift for integer U2,V2 packssdw mm0,mm1 // Pack integer texture values. shl eax,16 // Shift second pixel up or eax,ebx // Combine pixels. mov [edx + edi*2 - 4],eax // Store both pixels ALIGNED4: // // Make sure we have more than 4 pixels left. // add edi,4 jl INNER_LOOP_4P jmp FINISH_REMAINDER4 // Align start of loop to 4 past the beginning of a cache line. ALIGN 16 nop nop nop nop // // Do four pixels at a time. // // mm0 = packed & masked U1,V1,U2,V2 // mm7 is stepped once. // INNER_LOOP_4P: pmaddwd mm0,mm2 // Compute texture indicies. paddd mm7,mm6 // Step U,V movq [qIndexTemp],mm0 // Save texture indicies. movq mm0,mm7 // Copy U1,V1 mov eax,dword ptr[qIndexTemp + 4] // Get texture index 2 psrad mm0,16 // Shift for integer U1,V1 mov ebx,dword ptr[qIndexTemp] // Get texture index 1 paddd mm7,mm6 // Step U,V movzx eax,word ptr[esi + eax*2 + 4] // Lookup texel 2 movq mm1,mm7 // Copy U2,V2 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 psrad mm1,16 // Shift for integer U2,V2 packssdw mm0,mm1 // Pack integer texture values. shl eax,16 // Shift second pixel up pmaddwd mm0,mm2 // Compute texture indicies. paddd mm7,mm6 // Step U,V or eax,ebx // Combine pixels. movd mm3,eax // Copy P2,P1 movq [qIndexTemp],mm0 // Save texture indicies. movq mm0,mm7 // Copy U1,V1 mov eax,dword ptr[qIndexTemp + 4] // Get texture index 2 psrad mm0,16 // Shift for integer U1,V1 mov ebx,dword ptr[qIndexTemp] // Get texture index 1 paddd mm7,mm6 // Step U,V movzx eax,word ptr[esi + eax*2 + 4] // Lookup texel 2 movq mm1,mm7 // Copy U2,V2 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 psrad mm1,16 // Shift for integer U2,V2 packssdw mm0,mm1 // Pack integer texture values. shl eax,16 // Shift second pixel up or eax,ebx // Combine pixels. movd mm4,eax // Copy P4,P3 punpckldq mm3,mm4 // Combine P4,P3,P2,P1 movq [edx + edi*2 - 8],mm3 // Store 4 pixels add edi,4 // Increment index by 4 jl INNER_LOOP_4P // Loop (while >4 pixels) FINISH_REMAINDER4: // // One (edi == 3), two (edi == 2), three (edi == 1) or four (edi == 0) pixels left. // jnz THREE_OR_LESS // // Four pixels left, step texture coords 3 times. // pmaddwd mm0,mm2 // Compute texture indicies. paddd mm7,mm6 // Step U,V movq [qIndexTemp],mm0 // Save texture indicies. movq mm0,mm7 // Copy U1,V1 mov eax,dword ptr[qIndexTemp + 4] // Get texture index 2 psrad mm0,16 // Shift for integer U1,V1 mov ebx,dword ptr[qIndexTemp] // Get texture index 1 paddd mm7,mm6 // Step U,V movzx eax,word ptr[esi + eax*2 + 4] // Lookup texel 2 movq mm1,mm7 // Copy U2,V2 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 psrad mm1,16 // Shift for integer U2,V2 packssdw mm0,mm1 // Pack integer texture values. shl eax,16 // Shift second pixel up. pmaddwd mm0,mm2 // Compute texture indicies. paddd mm7,mm6 // Step U,V or eax,ebx // Combine pixels movd mm3,eax // Copy P2,P1 movq [qIndexTemp],mm0 // Save texture indicies. mov eax,dword ptr[qIndexTemp + 4] // Get texture index 2 mov ebx,dword ptr[qIndexTemp] // Get texture index 1 movzx eax,word ptr[esi + eax*2 + 4] // Lookup texel 2 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 shl eax,16 // Shift second pixel up or eax,ebx // Combine pixels. movd mm4,eax // Copy P4,P3 punpckldq mm3,mm4 // Combine P4,P3,P2,P1 movq [edx - 8],mm3 // Store 4 pixels jmp END_OF_SCANLINE THREE_OR_LESS: sub edi,2 jge FINISH_REMAINDER2 // // Three pixels left, step texture coords twice. // pmaddwd mm0,mm2 // Compute texture indicies. paddd mm7,mm6 // Step U,V movq [qIndexTemp],mm0 // Save texture indicies. movq mm0,mm7 // Copy U1,V1 mov eax,dword ptr[qIndexTemp + 4] // Get texture index 2 psrad mm0,16 // Shift for integer U1,V1 mov ebx,dword ptr[qIndexTemp] // Get texture index 1 paddd mm7,mm6 // Step U,V movzx eax,word ptr[esi + eax*2 + 4] // Lookup texel 2 movq mm1,mm7 // Copy U2,V2 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 psrad mm1,16 // Shift for integer U2,V2 packssdw mm0,mm1 // Pack integer texture values. shl eax,16 // Shift second pixel up or eax,ebx // Combine pixels. mov [edx - 6],eax // Store both pixels pmaddwd mm0,mm2 // Compute texture indicies. movq [qIndexTemp],mm0 // Save texture indicies. mov ebx,dword ptr[qIndexTemp] // Get texture index 1 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 mov [edx - 2],bx // Store pixel 1 jmp END_OF_SCANLINE FINISH_REMAINDER2: // // One pixel (edi == 1) or two pixels (edi == 0) left. // jnz ONE_PIXEL_LEFT // // Two pixels left, only step texture co-ordinates once. // pmaddwd mm0,mm2 // Compute texture indicies. movq [qIndexTemp],mm0 // Save texture indicies. mov eax,dword ptr[qIndexTemp + 4] // Get texture index 2 mov ebx,dword ptr[qIndexTemp] // Get texture index 1 movzx eax,word ptr[esi + eax*2 + 4] // Lookup texel 2 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 shl eax,16 // Shift second pixel up or eax,ebx // Combine pixels. mov [edx - 4],eax // Store both pixels jmp END_OF_SCANLINE ONE_PIXEL_LEFT: // // One pixel left, don't step texture co-ordinates. // pmaddwd mm0,mm2 // Compute texture indicies. movq [qIndexTemp],mm0 // Save texture indicies. mov ebx,dword ptr[qIndexTemp] // Get texture index 1 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 mov [edx - 2],bx // Store pixel 1 // ------------------------------------------------------------------------- // Increment the base edge. // END_OF_SCANLINE: mov eax,[pscanGlbl] movq mm1,[pf_texture_inc] movq mm7,[eax]TCopyLinear.indCoord.pfIndex mov ebx,[eax]TCopyLinear.fxXDifference pfadd (m7,m1) add ebx,[fx_diff] jge short NO_UNDERFLOW // Undeflow of fxXDifference. add ebx,0x00010000 movq mm1,[pfNegD] pfadd (m7,m1) jmp short EDGE_DONE NO_UNDERFLOW: cmp ebx,0x00010000 jl short EDGE_DONE // Overflow of fxXDifference. sub ebx,0x00010000 movq mm1,[pfNegD] pfadd (m7,m1) EDGE_DONE: // Store modified variables and do looping. // Step length, line starting address. mov esi,[pdtriGlbl] mov [eax]TCopyLinear.fxXDifference,ebx movq [eax]TCopyLinear.indCoord.pfIndex,mm7 mov ebx,[esi]CDrawPolygonBase.fxLineLength.i4Fx mov ecx,[esi]CDrawPolygonBase.iLineStartIndex add ebx,[esi]CDrawPolygonBase.fxDeltaLineLength.i4Fx add ecx,gsGlobals.u4LinePixels mov [esi]CDrawPolygonBase.fxLineLength.i4Fx,ebx mov [esi]CDrawPolygonBase.iLineStartIndex,ecx mov ecx,[eax]TCopyLinear.fxX mov edx,[fx_inc] add ecx,edx mov edx,[esi]CDrawPolygonBase.iY mov [eax]TCopyLinear.fxX,ecx inc edx mov ebx,[esi]CDrawPolygonBase.iYTo mov [esi]CDrawPolygonBase.iY,edx cmp edx,ebx jl Y_LOOP // Done with MMX/3DX. femms } } //***************************************************************************************** // // Perspective, Terrain Clut, non-transparent. // void DrawSubtriangle(TCopyTerrainPersp* pscan, CDrawPolygon* pdtri) { static void* pvLastTexture = 0; static void* pvLastFogClut = 0; TCopyTerrainPersp* plinc = &pdtri->pedgeBase->lineIncrement; Assert(pu2TerrainTextureFogClut); __asm { // Get ready to do MMX/3DX. femms // // Local copies of edge stepping values. // // fx_inc = pdtri->pedgeBase->lineIncrement.fxX; // fx_diff = pdtri->pedgeBase->lineIncrement.fxXDifference; // f_inc_uinvz = pdtri->pedgeBase->lineIncrement.indCoord.fUInvZ; // f_inc_vinvz = pdtri->pedgeBase->lineIncrement.indCoord.fVInvZ; // f_inc_invz = pdtri->pedgeBase->lineIncrement.indCoord.fInvZ; // mov eax,[plinc] mov esi,[pdtri] // Pointer to polygon object. mov ecx,[pscan] // Pointer to scanline object. mov ebx,[eax]TCopyTerrainPersp.fxX mov edx,[eax]TCopyTerrainPersp.fxXDifference mov fx_inc,ebx movq mm0,[eax]TCopyTerrainPersp.indCoord.fUInvZ mov fx_diff,edx mov ebx,[eax]TCopyTerrainPersp.indCoord.fInvZ movq [tvEdgeStep.UInvZ],mm0 mov [tvEdgeStep.InvZ],ebx // // Setup: // QWORD qMFactor = width,1,width,1 // Load global texture values. // movd mm3,[iTexWidth] // 0,0,0,Width movd mm4,[u4OneOne] // 0,0,1,1 punpcklwd mm3,mm3 // 0,0,Width,Width punpcklwd mm4,mm3 // Width,1,Width,1 movq mm0,[ecx]TCopyTerrainPersp.indCoord.fUInvZ // mm0 = (V/Z,U/Z) movq [qMFactor],mm4 // Save mfactor movd mm1,[ecx]TCopyTerrainPersp.indCoord.fInvZ // mm1 = (?,1/Z) // // Iterate through the scanlines that intersect the subtriangle. // // esi,ecx,mm0,mm1 setup. // Y_LOOP: mov eax,[esi]CDrawPolygonBase.iY mov ebx,[bEvenScanlinesOnly] push ebp // Check to see if we sould skip this scanline. and eax,ebx jnz END_OF_SCANLINE // // Setup for prespective correction and looping. // PERSPECTIVE_SCANLINE_PROLOGUE(TCopyTerrainPersp) // // // Determine the start and end of the scanline. // mov ebx,[ecx]TCopyTerrainPersp.fxX.i4Fx mov eax,[esi]CDrawPolygonBase.fxLineLength.i4Fx mov edx,[esi]CDrawPolygonBase.iLineStartIndex add ebx,eax mov eax,[ecx]TCopyTerrainPersp.fxX.i4Fx sar ebx,16 sar eax,16 add edx,ebx sub eax,ebx // eax == i_pixel jge END_OF_SCANLINE // Load span increments. movq mm2,[tvDEdge.UInvZ] // Load V,U mov [i_screen_index],edx // Save scanline index. movd mm3,[tvDEdge.InvZ] // Load Z mov [i_pixel],eax // Save negative length of scanline. mov edi,[iSubdivideLen] // scan line is +ve add eax,edi // eax = i_pixel + SubdivideLen jle short DONE_DIVIDE_PIXEL // // Subdivision is smaller than iSubdivideLen // // Adjust span increments by -i_pixel * fInvSubdivideLen // mov edi,[i_pixel] // Integer width of span. xor eax,eax // Remaining width. neg edi movd mm4,edi // -i_pixel pi2fd (m4,m4) // Convert to fp. movd mm5,[fInvSubdivideLen] // 1/SubDivideLen pfmul (m4,m5) // -i_pixel * fInvSubdivideLen punpckldq mm4,mm4 pfmul (m2,m4) // Adjust V,U increments. pfmul (m3,m4) // Adjust Z increment. DONE_DIVIDE_PIXEL: // // Compute current U,V // Step fGUInvZ,fGVInvZ,fGInvZ // pfrcp (m4,m1) // f_z = mm4 = 1/fGInvZ mov [iNextSubdivide],eax // Save length remaining. pfadd (m3,m1) // Step fGInvZ mov edx,[bClampUV] // Load clamp flag. pfadd (m2,m0) // Step fGUInvZ,fGVInvZ movd [tvCurUVZ.InvZ],mm3 // Save updated 1/Z pfmul (m4,m0) // mm4 = V1,U1 pfrcp (m0,m3) // f_next_z = mm0 = 1/fGInvZ movq [tvCurUVZ.UInvZ],mm2 // Save updated U/Z,V/Z movd mm1,float ptr[fInverseIntTable+edi*4] // Reciprocal of span width. pfmul (m0,m2) // mm0 = V2,U2 movq mm2,[pfFixed16Scale] // Load fixed point scale factors. #if (VER_CLAMP_UV_16BIT == TRUE) movq mm3,[pfTexEdgeTolerance] movq mm5,[cvMaxCoords] pfmax (m0,m3) // Clamp U1,V1 to >= fTexEdgeTolerance pfmin (m0,m5) // Clamp U1 < fTexWidth, V1 < fTexHeight pfmax (m4,m3) // Clamp U1,V1 to >= fTexEdgeTolerance pfmin (m4,m5) // Clamp U1 < fTexWidth, V1 < fTexHeight #endif punpckldq mm1,mm1 // Duplicate reciprocal of span width. movq [pfCurUV],mm0 // Save ending U,V pfsub (m0,m4) // mm0 = V2-V1,U2-U1 pfmul (m4,m2) // Scale to starting U,V 16.16 mov edi,[i_pixel] // edi = current i_pixel mov [i_pixel],eax // Save updated i_pixel. pfmul (m0,m1) // DV*1/Width,DU*1/Width pf2id (m7,m4) // Starting V,U in mm7 sub edi,eax // edi = inner loop count mov ebx,[i_screen_index] // Load scanline offset. pfmul (m0,m2) // Scale U,V slope to 16.16 mov edx,gsGlobals.pvScreen // Pointer the screen. mov esi,[pvTextureBitmap] // Load texture base pointer. add eax,ebx // Add scanline offset to i_pixel pf2id (m6,m0) // VStep,UStep in mm6 lea edx,[edx+eax*2] // Base of span in edx. SUBDIVISION_LOOP: //--------------------------------------------------------------------------- // Draw the subdivision. // // mm6,mm7 = texture values. // // Setup registers: // // eax = texel_index2 ebx = texel_index2 // ecx = clut pointer edx = dest_base_ptr // esi = texture_base_ptr edi = pixel_offset & count // ebp = alpha index mask // // mm0 = temp mm1 = temp // mm2 = mfactor mm3 = uvmask // mm6 = uvslope mm7 = uv (stepped once) // movq mm0,mm7 // Copy U1,V1 paddd mm7,mm6 // Step U,V psrad mm0,16 // Shift for integer U1,V1 movq mm1,mm7 // Copy U2,V2 movq mm2,[qMFactor] // Load texture multipliers psrad mm1,16 // Shift for integer U2,V2 mov ecx,[pu2TerrainTextureFogClut] // Terrain clut pointer. Hack. packssdw mm0,mm1 // Pack integer texture values sub esi,4 // Hack to force SIB + offset in loop sub ecx,4 // Hack to force SIB + offset in loop pmaddwd mm0,mm2 // Compute texture indicies. mov ebp,[u4TerrainFogMask] // // Make sure we have more than 2 pixels left. // add edi,2 jl short INNER_LOOP_2P jmp FINISH_REMAINDER // Align start of loop to 0 past the beginning of a cache line. ALIGN 16 nop ALIGN 16 // // Do two pixels at a time. // // mm0 = packed & masked U1,V1,U2,V2 // mm7 is stepped once. // INNER_LOOP_2P: movq [qIndexTemp],mm0 // Save texture indicies. paddd mm7,mm6 // Step U,V mov eax,dword ptr[qIndexTemp] // Get texture index 1 movq mm0,mm7 // Copy U1,V1 mov ebx,dword ptr[qIndexTemp + 4] // Get texture index 2 psrad mm0,16 // Shift for integer U1,V1 movzx eax,word ptr[esi + eax*2 + 4] // Lookup texel 1 paddd mm7,mm6 // Step U,V movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 2 movq mm1,mm7 // Copy U2,V2 psrad mm1,16 // Shift for integer U2,V2 packssdw mm0,mm1 // Pack integer texture values. and eax,ebp // Apply alpha mask. pmaddwd mm0,mm2 // Compute texture indicies. movzx eax,word ptr[ecx + eax*2 + 4] // Clut lookup for p1 and ebx,ebp // Apply alpha mask. movzx ebx,word ptr[ecx + ebx*2 + 4] // Clut lookup for p2 mov [edx + edi*2 - 4],ax // Store pixel 1 mov [edx + edi*2 - 2],bx // Store pixel 2 add edi,2 // Inrcement index by 2 jl short INNER_LOOP_2P // Loop (while >2 pixels) FINISH_REMAINDER: // // One pixel (edi == 1) or two pixels (edi == 0) left. // jnz short ONE_PIXEL_LEFT // // Two pixels left, only step texture co-ordinates once. // movq [qIndexTemp],mm0 // Save texture indicies. paddd mm7,mm6 // Step U,V mov eax,dword ptr[qIndexTemp] // Get texture index 1 mov ebx,dword ptr[qIndexTemp + 4] // Get texture index 2 movzx eax,word ptr[esi + eax*2 + 4] // Lookup texel 1 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 2 and eax,ebp // Apply alpha mask. movzx eax,word ptr[ecx + eax*2 + 4] // Clut lookup for p1 and ebx,ebp // Apply alpha mask. movzx ebx,word ptr[ecx + ebx*2 + 4] // Clut lookup for p2 mov [edx + edi*2 - 4],ax // Store pixel 1 mov [edx + edi*2 - 2],bx // Store pixel 2 mov eax,[i_pixel] // Load remaining length. jmp short END_OF_SPAN ONE_PIXEL_LEFT: // // One pixel left, don't step texture co-ordinates. // movq [qIndexTemp],mm0 // Save texture indicies. mov eax,[i_pixel] // Load remaining length. mov ebx,dword ptr[qIndexTemp] // Get texture index 1 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 and ebx,ebp // Apply alpha mask. movzx ebx,word ptr[ecx + ebx*2 + 4] // Clut lookup for p1 mov [edx + edi*2 - 4],bx // Store pixel 1 END_OF_SPAN: // See if there are any pixels left. cmp eax,0 jz short END_OF_SCANLINE // // Do perspective correction and looping. // PERSPECTIVE_SCANLINE_EPILOGUE // //---------------------------------------------------------------------------------- // Implement perspective correction. // // eax = i_pixel // Preserve: mm7 // mov edi,[iSubdivideLen] // Load last texture values. movq mm0,[tvCurUVZ.UInvZ] // mm0 = (V/Z,U/Z) movd mm1,[tvCurUVZ.InvZ] // mm1 = (?,1/Z) // Load span increments. movq mm2,[tvDEdge.UInvZ] // V,U movd mm3,[tvDEdge.InvZ] // Z // scan line is +ve add eax,edi // eax = i_pixel + SubdivideLen jle short DONE_DIVIDE_PIXEL_END // // Subdivision is smaller than iSubdivideLen // // Adjust span increments by -i_pixel * fInvSubdivideLen // mov edi,[i_pixel] // Integer width of span. xor eax,eax // Remaining width. neg edi movd mm4,edi // -i_pixel pi2fd (m4,m4) // Convert to fp. movd mm5,[fInvSubdivideLen] // 1/SubDivideLen pfmul (m4,m5) // -i_pixel * fInvSubdivideLen punpckldq mm4,mm4 pfmul (m2,m4) // Adjust V,U increments. pfmul (m3,m4) // Adjust Z increment. DONE_DIVIDE_PIXEL_END: // // Compute current U,V // Step fGUInvZ,fGVInvZ,fGInvZ // pfadd (m3,m1) // Step fGInvZ mov [iNextSubdivide],eax // Save length remaining. pfadd (m2,m0) // Step fGUInvZ,fGVInvZ movd [tvCurUVZ.InvZ],mm3 // Save updated fGInvZ pfrcp (m0,m3) // f_z = mm0 = 1/fGInvZ movq mm4,[pfCurUV] // Load last U,V movq [tvCurUVZ.UInvZ],mm2 // Save updated fGUInvZ,fGVInvZ movd mm1,float ptr[fInverseIntTable+edi*4] // Reciprocal of span width. pfmul (m0,m2) // mm0 = V2,U2 movq mm2,[pfFixed16Scale] // Load fixed point scale factors. #if (VER_CLAMP_UV_16BIT == TRUE) movq mm3,[pfTexEdgeTolerance] movq mm5,[cvMaxCoords] pfmax (m0,m3) // Clamp U1,V1 to >= fTexEdgeTolerance pfmin (m0,m5) // Clamp U1 < fTexWidth, V1 < fTexHeight #endif movq [pfCurUV],mm0 // Save current U,V punpckldq mm1,mm1 // Duplicate across entire register. pfsub (m0,m4) // V2-V1,U2-U1 mov edi,[i_pixel] // edi = current i_pixel mov [i_pixel],eax // Save updated i_pixel. mov edx,gsGlobals.pvScreen // Pointer the screen. pfmul (m0,m1) // DV*1/Width,DU*1/Width mov ebx,[i_screen_index] // Load scanline offset. sub edi,eax // edi = inner loop count mov esi,[pvTextureBitmap] // Load texture base pointer. pfmul (m0,m2) // Scale to 16.16 add eax,ebx // Add scanline offset to i_pixel lea edx,[edx+eax*2] // Base of span in edx. pf2id (m6,m0) // VStep,UStep in mm6 jmp SUBDIVISION_LOOP END_OF_SCANLINE: // // Increment the base edge. // pop ebp mov esi,[pdtri] // Pointer to polygon object. mov ecx,[pscan] // Pointer to scanline object. // Step length, line starting address. mov edx,[esi]CDrawPolygonBase.fxLineLength.i4Fx mov edi,[esi]CDrawPolygonBase.iLineStartIndex add edx,[esi]CDrawPolygonBase.fxDeltaLineLength.i4Fx add edi,gsGlobals.u4LinePixels mov eax,[ecx]TCopyTerrainPersp.fxX.i4Fx mov [esi]CDrawPolygonBase.fxLineLength.i4Fx,edx mov ebx,[ecx]TCopyTerrainPersp.fxXDifference.i4Fx mov [esi]CDrawPolygonBase.iLineStartIndex,edi // Step edge values. movq mm0,[ecx]TCopyTerrainPersp.indCoord.fUInvZ movq mm2,[tvEdgeStep.UInvZ] movd mm1,[ecx]TCopyTerrainPersp.indCoord.fInvZ movd mm3,[tvEdgeStep.InvZ] pfadd (m0,m2) mov edi,[fx_diff] pfadd (m1,m3) add eax,[fx_inc] add ebx,edi jge short NO_UNDERFLOW // Underflow. // Add borrow/carry adjustments to U,V,Z. movq mm2,[tvNegAdj.UInvZ] add ebx,0x00010000 movd mm3,[tvNegAdj.InvZ] pfadd (m0,m2) pfadd (m1,m3) jmp short FINISH_LOOPING NO_UNDERFLOW: cmp ebx,0x00010000 jl short FINISH_LOOPING // Overflow. // Add borrow/carry adjustments to U,V,Z. movq mm2,[tvNegAdj.UInvZ] sub ebx,0x00010000 movd mm3,[tvNegAdj.InvZ] pfadd (m0,m2) pfadd (m1,m3) FINISH_LOOPING: // Save values and Y Loop control. movq [ecx]TCopyTerrainPersp.indCoord.fUInvZ,mm0 movd [ecx]TCopyTerrainPersp.indCoord.fInvZ,mm1 mov [ecx]TCopyTerrainPersp.fxX.i4Fx,eax mov edx,[esi]CDrawPolygonBase.iY mov [ecx]TCopyTerrainPersp.fxXDifference.i4Fx,ebx mov ebx,[esi]CDrawPolygonBase.iYTo inc edx mov [esi]CDrawPolygonBase.iY,edx cmp edx,ebx jl Y_LOOP // Done with MMX/3DX. femms } } //***************************************************************************************** // // Linear, Terrain clut, non-transparent. // //********************************** void DrawSubtriangle(TCopyTerrainLinear* pscan, CDrawPolygon* pdtri) { typedef TCopyTerrainLinear::TIndex prevent_internal_compiler_errors; static CDrawPolygon* pdtriGlbl; static TCopyTerrainLinear* pscanGlbl; static PackedFloatUV pf_texture_inc; TCopyTerrainLinear* plinc = &pdtri->pedgeBase->lineIncrement; __asm { // Get ready to do MMX/3DX. femms mov edi,[plinc] // Pointer to scanline increment. mov esi,[pdtri] // Pointer to polygon object. push ebp // Save base pointer. mov eax,[pscan] // Pointer to scanline object. mov pdtriGlbl,esi // // Local copies of edge stepping values. // // fx_inc = pdtri->pedgeBase->lineIncrement.fxX; // fx_diff = pdtri->pedgeBase->lineIncrement.fxXDifference; // pf_texture_inc = pdtri->pedgeBase->lineIncrement.indCoord.pfIndex; // pdtriGlbl = pdtri; // pscanGlbl = pscan; // fx_gour_inc = pdtri->pedgeBase->lineIncrement.gourIntensity.fxIntensity; // mov ebx,[edi]TCopyTerrainLinear.fxX mov pscanGlbl,eax mov ecx,[edi]TCopyTerrainLinear.fxXDifference mov fx_inc,ebx movq mm0,[edi]TCopyTerrainLinear.indCoord.pfIndex mov fx_diff,ecx movq [pf_texture_inc],mm0 // // Setup: // // mm2 = qMFactor = width,1,width,1 // mm3 = qUVMasks // mm6 = uvslope // movd mm1,[iTexWidth] // 0,0,0,Width movd mm2,[u4OneOne] // 0,0,1,1 punpcklwd mm1,mm1 // 0,0,Width,Width movq mm6,[pfxDeltaTex] // Load texture slopes. punpcklwd mm2,mm1 // Width,1,Width,1 movq mm7,[eax]TCopyTerrainLinear.indCoord.pfIndex mov ebp,[u4TerrainFogMask] // // Iterate through the scanlines that intersect the subtriangle. // Y_LOOP: mov ebx,[bEvenScanlinesOnly] mov edx,[esi]CDrawPolygonBase.iY // Check for odd scan lines here and skip if even and edx,ebx jnz short END_OF_SCANLINE mov ebx,[eax]TCopyTerrainLinear.fxX.i4Fx mov ecx,[esi]CDrawPolygonBase.fxLineLength.i4Fx mov edi,ebx mov edx,gsGlobals.pvScreen add ebx,ecx sar edi,16 mov ecx,[esi]CDrawPolygonBase.iLineStartIndex sar ebx,16 add ecx,ebx lea edx,[edx + ecx*2] sub edi,ebx jge short END_OF_SCANLINE // no pixels to draw //--------------------------------------------------------------------------- // Draw the scanline. // // eax = texel_index2 ebx = texel_index2 // ecx = clut pointer edx = dest_base_ptr // esi = texture_base_ptr edi = pixel_offset & count // ebp = alpha index mask // // mm0 = temp mm1 = temp // mm2 = mfactor mm3 = uvmask // mm6 = uvslope mm7 = uv (stepped once) // pf2id (m7,m7) // Convert texture values to 16.16 fixed. mov esi,[pvTextureBitmap] // Load texture base pointer. movq mm0,mm7 // Copy U1,V1 paddd mm7,mm6 // Step U,V psrad mm0,16 // Shift for integer U1,V1 movq mm1,mm7 // Copy U2,V2 mov ecx,[pu2TerrainTextureFogClut] // Terrain clut pointer. Hack. psrad mm1,16 // Shift for integer U2,V2 packssdw mm0,mm1 // Pack integer texture values sub esi,4 // Hack to force SIB + offset in loop sub ecx,4 // Hack to force SIB + offset in loop pmaddwd mm0,mm2 // Compute texture indicies. // // Make sure we have more than 2 pixels left. // add edi,2 jl short INNER_LOOP_2P jmp FINISH_REMAINDER // Align start of loop to 0 past the beginning of a cache line. ALIGN 16 // // Do two pixels at a time. // // mm0 = packed & masked U1,V1,U2,V2 // mm7 is stepped once. // INNER_LOOP_2P: movq [qIndexTemp],mm0 // Save texture indicies. paddd mm7,mm6 // Step U,V mov eax,dword ptr[qIndexTemp] // Get texture index 1 movq mm0,mm7 // Copy U1,V1 mov ebx,dword ptr[qIndexTemp + 4] // Get texture index 2 psrad mm0,16 // Shift for integer U1,V1 movzx eax,word ptr[esi + eax*2 + 4] // Lookup texel 1 paddd mm7,mm6 // Step U,V movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 2 movq mm1,mm7 // Copy U2,V2 psrad mm1,16 // Shift for integer U2,V2 packssdw mm0,mm1 // Pack integer texture values. and eax,ebp // Apply alpha mask. pmaddwd mm0,mm2 // Compute texture indicies. movzx eax,word ptr[ecx + eax*2 + 4] // Clut lookup for p1 and ebx,ebp // Apply alpha mask. movzx ebx,word ptr[ecx + ebx*2 + 4] // Clut lookup for p2 mov [edx + edi*2 - 4],ax // Store pixel 1 mov [edx + edi*2 - 2],bx // Store pixel 2 add edi,2 // Inrcement index by 2 jl short INNER_LOOP_2P // Loop (while >2 pixels) FINISH_REMAINDER: // // One pixel (edi == 1) or two pixels (edi == 0) left. // jnz short ONE_PIXEL_LEFT // // Two pixels left, only step texture co-ordinates once. // movq [qIndexTemp],mm0 // Save texture indicies. mov eax,dword ptr[qIndexTemp] // Get texture index 1 mov ebx,dword ptr[qIndexTemp + 4] // Get texture index 2 movzx eax,word ptr[esi + eax*2 + 4] // Lookup texel 1 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 2 and eax,ebp // Apply alpha mask. movzx eax,word ptr[ecx + eax*2 + 4] // Clut lookup for p1 and ebx,ebp // Apply alpha mask. movzx ebx,word ptr[ecx + ebx*2 + 4] // Clut lookup for p2 mov [edx + edi*2 - 4],ax // Store pixel 1 mov [edx + edi*2 - 2],bx // Store pixel 2 jmp short END_OF_SCANLINE ONE_PIXEL_LEFT: // // One pixel left, don't step texture co-ordinates. // movq [qIndexTemp],mm0 // Save texture indicies. mov ebx,dword ptr[qIndexTemp] // Get texture index 1 movzx ebx,word ptr[esi + ebx*2 + 4] // Lookup texel 1 and ebx,ebp // Apply alpha mask. movzx ebx,word ptr[ecx + ebx*2 + 4] // Clut lookup for p1 mov [edx + edi*2 - 4],bx // Store pixel 1 // ------------------------------------------------------------------------- // Increment the base edge. // END_OF_SCANLINE: mov eax,[pscanGlbl] movq mm1,[pf_texture_inc] movq mm7,[eax]TCopyTerrainLinear.indCoord.pfIndex mov ebx,[eax]TCopyTerrainLinear.fxXDifference pfadd (m7,m1) add ebx,[fx_diff] jge short NO_UNDERFLOW // Undeflow of fxXDifference. add ebx,0x00010000 movq mm1,[pfNegD] pfadd (m7,m1) jmp short EDGE_DONE NO_UNDERFLOW: cmp ebx,0x00010000 jl short EDGE_DONE // Overflow of fxXDifference. sub ebx,0x00010000 movq mm1,[pfNegD] pfadd (m7,m1) EDGE_DONE: // Store modified variables and do looping. // Step length, line starting address. mov esi,[pdtriGlbl] mov [eax]TCopyTerrainLinear.fxXDifference,ebx movq [eax]TCopyTerrainLinear.indCoord.pfIndex,mm7 mov ebx,[esi]CDrawPolygonBase.fxLineLength.i4Fx mov ecx,[esi]CDrawPolygonBase.iLineStartIndex add ebx,[esi]CDrawPolygonBase.fxDeltaLineLength.i4Fx add ecx,gsGlobals.u4LinePixels mov [esi]CDrawPolygonBase.fxLineLength.i4Fx,ebx mov [esi]CDrawPolygonBase.iLineStartIndex,ecx mov ecx,[eax]TCopyTerrainLinear.fxX mov edx,[fx_inc] add ecx,edx mov edx,[esi]CDrawPolygonBase.iY mov [eax]TCopyTerrainLinear.fxX,ecx inc edx mov ebx,[esi]CDrawPolygonBase.iYTo mov [esi]CDrawPolygonBase.iY,edx cmp edx,ebx jl Y_LOOP // Done with MMX/3DX. femms pop ebp // Restore base pointer. } } #if (BILINEAR_FILTER) #pragma code_seg("SelfMod") // // Globals used by filtering. // static uint64 qUVStep; static uint64 usteps; static uint64 vsteps; const uint64 mask_greens_555 = 0x03e003e003e003e0; const uint64 mask_greens_565 = 0x07e007e007e007e0; const uint64 mask_blues = 0x001f001f001f001f; const uint64 red_green_shift_555 = 0x0020002004000400; // 2^5,2^5,2^10,2^10 const uint64 red_green_shift_565 = 0x0020002008000800; // 2^5,2^5,2^11,2^11 const uint64 red_green_mask_555 = 0x0003e00000f80000; const uint64 red_green_mask_565 = 0x0007e00000f80000; const uint64 uf_xors = 0x00000000ffffffff; const uint64 uf_adds = 0x0000000000010001; const uint64 vf_xors = 0x0000ffff0000ffff; const uint64 vf_adds = 0x0000000100000001; // Default setup for 565 color. static uint64 mask_greens = mask_greens_565; static uint64 red_green_shift = red_green_shift_565; static uint64 red_green_mask = red_green_mask_565; //***************************************************************************************** // // Perspective, no Clut, non-transparent, filtered. // void DrawSubtriangle(TCopyPerspFilter* pscan, CDrawPolygon* pdtri) { ::fixed fx_inc; ::fixed fx_diff; static int iBits = 16; TCopyPerspFilter* plinc = &pdtri->pedgeBase->lineIncrement; // // This type is the same as TCopyPerspFilter, except that it uses the parent class of // CIndexPerspectiveFilter to allow us to access members of it. // typedef CScanline, CIndexPerspective, CColLookupOff> FakeTCopyPerspFilter; __asm { // Get ready to do MMX/3DX. femms // Check/Modify for 555/565 color. mov eax,gsGlobals.iBits cmp eax,[iBits] je short DONE_WITH_MODIFY mov [iBits],eax cmp eax,16 je MODIFY_FOR_565 // Change to 555. movq mm0,mask_greens_555 movq mask_greens,mm0 movq mm0,red_green_shift_555 movq red_green_shift,mm0 movq mm0,red_green_mask_555 movq red_green_mask,mm0 lea eax,MODIFY_FOR_RED_SHIFT_A mov byte ptr[eax-1],10 lea eax,MODIFY_FOR_RED_SHIFT_B mov byte ptr[eax-1],10 lea eax,MODIFY_FOR_RED_SHIFT_C mov byte ptr[eax-1],10 lea eax,MODIFY_FOR_RED_SHIFT_D mov byte ptr[eax-1],10 lea eax,MODIFY_FOR_RED_SHIFT_E mov byte ptr[eax-1],10 jmp short DONE_WITH_MODIFY MODIFY_FOR_565: // Change to 565. movq mm0,mask_greens_565 movq mask_greens,mm0 movq mm0,red_green_shift_565 movq red_green_shift,mm0 movq mm0,red_green_mask_565 movq red_green_mask,mm0 lea eax,MODIFY_FOR_RED_SHIFT_A mov byte ptr[eax-1],11 lea eax,MODIFY_FOR_RED_SHIFT_B mov byte ptr[eax-1],11 lea eax,MODIFY_FOR_RED_SHIFT_C mov byte ptr[eax-1],11 lea eax,MODIFY_FOR_RED_SHIFT_D mov byte ptr[eax-1],11 lea eax,MODIFY_FOR_RED_SHIFT_E mov byte ptr[eax-1],11 DONE_WITH_MODIFY: // // Local copies of edge stepping values. // // fx_inc = pdtri->pedgeBase->lineIncrement.fxX; // fx_diff = pdtri->pedgeBase->lineIncrement.fxXDifference; // f_inc_uinvz = pdtri->pedgeBase->lineIncrement.indCoord.fUInvZ; // f_inc_vinvz = pdtri->pedgeBase->lineIncrement.indCoord.fVInvZ; // f_inc_invz = pdtri->pedgeBase->lineIncrement.indCoord.fInvZ; // mov eax,[plinc] mov esi,[pdtri] // Pointer to polygon object. mov ecx,[pscan] // Pointer to scanline object. mov ebx,[eax]TCopyPerspFilter.fxX mov edx,[eax]TCopyPerspFilter.fxXDifference mov fx_inc,ebx movq mm0,[eax]FakeTCopyPerspFilter.indCoord.fUInvZ mov fx_diff,edx mov ebx,[eax]FakeTCopyPerspFilter.indCoord.fInvZ movq [tvEdgeStep.UInvZ],mm0 mov [tvEdgeStep.InvZ],ebx // // Setup: // QWORD qMFactor = width,1,width,1 // Load global texture values. // movd mm3,[iTexWidth] // 0,0,0,Width movd mm4,[u4OneOne] // 0,0,1,1 punpcklwd mm3,mm3 // 0,0,Width,Width punpcklwd mm4,mm3 // Width,1,Width,1 movq mm0,[ecx]FakeTCopyPerspFilter.indCoord.fUInvZ // mm0 = (V/Z,U/Z) movq [qMFactor],mm4 // Save mfactor movd mm1,[ecx]FakeTCopyPerspFilter.indCoord.fInvZ // mm1 = (?,1/Z) // // Iterate through the scanlines that intersect the subtriangle. // // esi,ecx,mm0,mm1 setup. // Y_LOOP: mov eax,[esi]CDrawPolygonBase.iY mov ebx,[bEvenScanlinesOnly] // Check to see if we sould skip this scanline. and eax,ebx jnz END_OF_SCANLINE // // Setup for prespective correction and looping. // PERSPECTIVE_SCANLINE_PROLOGUE(TCopyPerspFilter) // // // Determine the start and end of the scanline. // mov ebx,[ecx]TCopyPerspFilter.fxX.i4Fx mov eax,[esi]CDrawPolygonBase.fxLineLength.i4Fx mov edx,[esi]CDrawPolygonBase.iLineStartIndex add ebx,eax mov eax,[ecx]TCopyPerspFilter.fxX.i4Fx sar ebx,16 sar eax,16 add edx,ebx sub eax,ebx // eax == i_pixel jge END_OF_SCANLINE // Load span increments. movq mm2,[tvDEdge.UInvZ] // Load V,U mov [i_screen_index],edx // Save scanline index. movd mm3,[tvDEdge.InvZ] // Load Z mov [i_pixel],eax // Save negative length of scanline. mov edi,[iSubdivideLen] // scan line is +ve add eax,edi // eax = i_pixel + SubdivideLen jle short DONE_DIVIDE_PIXEL // // Subdivision is smaller than iSubdivideLen // // Adjust span increments by -i_pixel * fInvSubdivideLen // mov edi,[i_pixel] // Integer width of span. xor eax,eax // Remaining width. neg edi movd mm4,edi // -i_pixel pi2fd (m4,m4) // Convert to fp. movd mm5,[fInvSubdivideLen] // 1/SubDivideLen pfmul (m4,m5) // -i_pixel * fInvSubdivideLen punpckldq mm4,mm4 pfmul (m2,m4) // Adjust V,U increments. pfmul (m3,m4) // Adjust Z increment. DONE_DIVIDE_PIXEL: // // Compute current U,V // Step fGUInvZ,fGVInvZ,fGInvZ // pfrcp (m4,m1) // f_z = mm4 = 1/fGInvZ mov [iNextSubdivide],eax // Save length remaining. pfadd (m3,m1) // Step fGInvZ mov edx,[bClampUV] // Load clamp flag. pfadd (m2,m0) // Step fGUInvZ,fGVInvZ movd [tvCurUVZ.InvZ],mm3 // Save updated 1/Z pfmul (m4,m0) // mm4 = V1,U1 pfrcp (m0,m3) // f_next_z = mm0 = 1/fGInvZ movq [tvCurUVZ.UInvZ],mm2 // Save updated U/Z,V/Z movd mm1,float ptr[fInverseIntTable+edi*4] // Reciprocal of span width. pfmul (m0,m2) // mm0 = V2,U2 movq mm2,[pfFixed16Scale] // Load fixed point scale factors. #if (VER_CLAMP_UV_16BIT == TRUE) movq mm3,[pfTexEdgeTolerance] movq mm5,[cvMaxCoords] pfmax (m0,m3) // Clamp U1,V1 to >= fTexEdgeTolerance pfmin (m0,m5) // Clamp U1 < fTexWidth, V1 < fTexHeight pfmax (m4,m3) // Clamp U1,V1 to >= fTexEdgeTolerance pfmin (m4,m5) // Clamp U1 < fTexWidth, V1 < fTexHeight #endif punpckldq mm1,mm1 // Duplicate reciprocal of span width. movq [pfCurUV],mm0 // Save ending U,V pfsub (m0,m4) // mm0 = V2-V1,U2-U1 pfmul (m4,m2) // Scale to starting U,V 16.16 mov edi,[i_pixel] // edi = current i_pixel mov [i_pixel],eax // Save updated i_pixel. pfmul (m0,m1) // DV*1/Width,DU*1/Width pf2id (m7,m4) // Starting V,U in mm7 sub edi,eax // edi = inner loop count mov ebx,[i_screen_index] // Load scanline offset. pfmul (m0,m2) // Scale U,V slope to 16.16 mov edx,gsGlobals.pvScreen // Pointer the screen. mov esi,[pvTextureBitmap] // Load texture base pointer. add eax,ebx // Add scanline offset to i_pixel pf2id (m6,m0) // VStep,UStep in mm6 lea edx,[edx+eax*2] // Base of span in edx. SUBDIVISION_LOOP: //--------------------------------------------------------------------------- // Draw the subdivision. // // mm6,mm7 = texture values. // // eax = texel_index1 ebx = texel_index2 // ecx = esi+twidth edx = dest_base_ptr // esi = texture_base_ptr edi = pixel_offset & count // ebp = base pointer // // mm0 = temp mm1 = temp // mm2 = mfactor/temp mm3 = temp // mm4 = temp mm5 = u fracs // mm6 = uvslope/v fracs mm7 = uv // movq [qUVStep],mm6 // Save span slopes. // Setup usteps and vsteps movq mm3,mm6 movq mm4,mm6 punpcklwd mm3,mm3 psrlq mm4,32 punpckldq mm3,mm3 punpcklwd mm4,mm4 pxor mm3,dword ptr[uf_xors] punpckldq mm4,mm4 paddw mm3,dword ptr[uf_adds] pxor mm4,dword ptr[vf_xors] paddw mm4,dword ptr[vf_adds] movq [usteps],mm3 movq [vsteps],mm4 mov ecx,[iTexWidth] lea ecx,[esi+ecx*2] // Setup mm5 with u fractions, mm6 with v fractions, initial texture walk. movq mm5,mm7 movq mm0,mm7 // Copy U1,V1 paddd mm7,mm6 // Step U,V psrad mm0,16 // Shift for integer U1,V1 movq mm1,mm7 // Copy U2,V2 psrad mm1,16 // Shift for integer U2,V2 packssdw mm0,mm1 // Pack integer texture values pmaddwd mm0,[qMFactor] // Compute texture indicies. movq [qIndexTemp],mm0 // Save texture indicies. mov eax,dword ptr[qIndexTemp] // Get texture index 1 mov ebx,dword ptr[qIndexTemp + 4] // Get texture index 2 movq mm6,mm5 punpcklwd mm5,mm5 // ?,du,?,du psrlq mm6,32 punpckldq mm5,mm5 // du,du,du,du punpcklwd mm6,mm6 // ?,dv,?,dv pxor mm5,dword ptr[uf_xors] punpckldq mm6,mm6 // dv,dv,dv,dv pxor mm6,dword ptr[vf_xors] dec esi // Force sib+offset in loop. dec ecx // Force sib+offset in loop. // // Make sure we have more than 2 pixels left. // add edi,2 jl short INNER_LOOP_2P jmp FINISH_REMAINDER // Align start of loop to 7 past the beginning of a cache line. ALIGN 16 nop nop nop nop nop nop nop // // Do two pixels at a time. // // mm0 = packed & masked U1,V1,U2,V2 // mm7 is stepped once. // INNER_LOOP_2P: movd mm0,[esi+eax*2+1] // lookup texels (0,0,T1,T0) movq mm3,mm5 // copy u fractions punpcklwd mm0,[ecx+eax*2+1] // lookup texels (T3, T1, T2, T0) movq mm4,mm6 // copy v fractions psrlw mm3,4 // shift ufracs to be unsigned paddw mm5,[usteps] // advance u fractions psrlw mm4,4 // shift vfracs to be unsinged paddw mm6,[vsteps] // advance v fractions movq mm2,mm0 // copy texels pand mm0,[mask_greens] // mask greens pmulhw mm3,mm4 // combine u and v fractions (0x00FF == 1) movq mm1,mm2 // copy texels pand mm2,[mask_blues] // mask blues psrlw mm1,11 // shift reds into position (no need to mask) MODIFY_FOR_RED_SHIFT_A: pmaddwd mm1,mm3 // multiply and add reds (R0+R2, R1+R3) psrlw mm0,5 // shift greens into positon pmaddwd mm0,mm3 // multiply and add greens (G0+G2, G1+G3) movq mm4,[red_green_shift] // Load multiplication factors for shift. pmaddwd mm2,mm3 // multiply and add blues (B0+B2, B1+B3) movq mm3,[red_green_mask] // Load masks for red and green. packssdw mm1,mm0 // Pack reds and greens together (G0,G1,R0,R1) movd mm0,[esi+ebx*2+1] // lookup texels (0,0,T1,T0) pmaddwd mm1,mm4 // Sum and shift reds and greens. (G, R) movq mm4,mm2 // Copy blues. psrlq mm4,32 // Shift high blues down. punpcklwd mm0,[ecx+ebx*2+1] // lookup texels (T3, T1, T2, T0) paddd mm2,mm4 // Sum blues. (X, B) pand mm1,mm3 // Mask reds and greens. por mm2,mm1 // Combine blue and red. psrlq mm1,32 // Shift green down. por mm2,mm1 // Combine red with blue and green. movq mm3,mm5 // copy u fractions psrld mm2,8 // Shift into final position. movq mm4,mm6 // copy v fractions movd eax,mm2 // Get pixel value. psrlw mm3,4 // shift ufracs to be unsigned mov [edx + edi*2 - 4],ax // Store pixel 1 paddw mm5,[usteps] // advance u fractions nop // for cache alignment psrlw mm4,4 // shift vfracs to be unsinged paddw mm6,[vsteps] // advance v fractions movq mm2,mm0 // copy texels pand mm0,[mask_greens] // mask greens pmulhw mm3,mm4 // combine u and v fractions (0x00FF == 1) movq mm1,mm2 // copy texels pand mm2,[mask_blues] // mask blues psrlw mm1,11 // shift reds into position (no need to mask) MODIFY_FOR_RED_SHIFT_B: pmaddwd mm1,mm3 // multiply and add reds (R0+R2, R1+R3) psrlw mm0,5 // shift greens into positon pmaddwd mm0,mm3 // multiply and add greens (G0+G2, G1+G3) movq mm4,[red_green_shift] // Load multiplication factors for shift. pmaddwd mm2,mm3 // multiply and add blues (B0+B2, B1+B3) movq mm3,[red_green_mask] // Load masks for red and green. packssdw mm1,mm0 // Pack reds and greens together (G0,G1,R0,R1) movq mm0,[qUVStep] // Load texture step. pmaddwd mm1,mm4 // Sum and shift reds and greens. (G, R) movq mm4,mm2 // Copy blues. paddd mm7,mm0 // Step U,V psrlq mm4,32 // Shift high blues down. paddd mm2,mm4 // Sum blues. (X, B) movq mm4,mm7 // Copy U1,V1 paddd mm7,mm0 // Step U,V pand mm1,mm3 // Mask reds and greens. psrad mm4,16 // Shift for integer U1,V1 movq mm0,mm7 // Copy U2,V2 por mm2,mm1 // Combine blue and red. psrad mm0,16 // Shift for integer U2,V2 packssdw mm4,mm0 // Pack integer texture values. movq mm3,[qMFactor] // Texture multipliers psrlq mm1,32 // Shift green down. por mm2,mm1 // Combine red with blue and green. pmaddwd mm4,mm3 // Compute texture indicies. psrld mm2,8 // Shift into final position. movq [qIndexTemp],mm4 // Save texture indicies. movd ebx,mm2 // Get pixel value. mov eax,dword ptr[qIndexTemp] // Load texel index 1. add edi,2 // Inrcement index by 2 mov [edx + edi*2 - 6],bx // Store pixel 2 mov ebx,dword ptr[qIndexTemp+4] // Load texel index 2. jl short INNER_LOOP_2P // Loop (while >2 pixels) FINISH_REMAINDER: // // One pixel (edi == 1) or two pixels (edi == 0) left. // jnz short ONE_PIXEL_LEFT // // Two pixels left, only step texture co-ordinates once. // movd mm0,[esi+eax*2+1] // lookup texels (0,0,T1,T0) movq mm3,mm5 // copy u fractions punpcklwd mm0,[ecx+eax*2+1] // lookup texels (T3, T1, T2, T0) movq mm4,mm6 // copy v fractions psrlw mm3,4 // shift ufracs to be unsigned paddw mm5,[usteps] // advance u fractions psrlw mm4,4 // shift vfracs to be unsinged paddw mm6,[vsteps] // advance v fractions movq mm2,mm0 // copy texels pand mm0,[mask_greens] // mask greens pmulhw mm3,mm4 // combine u and v fractions (0x00FF == 1) movq mm1,mm2 // copy texels pand mm2,[mask_blues] // mask blues psrlw mm1,11 // shift reds into position (no need to mask) MODIFY_FOR_RED_SHIFT_C: pmaddwd mm1,mm3 // multiply and add reds (R0+R2, R1+R3) psrlw mm0,5 // shift greens into positon pmaddwd mm0,mm3 // multiply and add greens (G0+G2, G1+G3) movq mm4,[red_green_shift] // Load multiplication factors for shift. pmaddwd mm2,mm3 // multiply and add blues (B0+B2, B1+B3) movq mm3,[red_green_mask] // Load masks for red and green. packssdw mm1,mm0 // Pack reds and greens together (G0,G1,R0,R1) movd mm0,[esi+ebx*2+1] // lookup texels (0,0,T1,T0) pmaddwd mm1,mm4 // Sum and shift reds and greens. (G, R) movq mm4,mm2 // Copy blues. psrlq mm4,32 // Shift high blues down. punpcklwd mm0,[ecx+ebx*2+1] // lookup texels (T3, T1, T2, T0) paddd mm2,mm4 // Sum blues. (X, B) pand mm1,mm3 // Mask reds and greens. por mm2,mm1 // Combine blue and red. psrlq mm1,32 // Shift green down. por mm2,mm1 // Combine red with blue and green. movq mm3,mm5 // copy u fractions psrld mm2,8 // Shift into final position. movq mm4,mm6 // copy v fractions movd eax,mm2 // Get pixel value. psrlw mm3,4 // shift ufracs to be unsigned mov [edx + edi*2 - 4],ax // Store pixel 1 psrlw mm4,4 // shift vfracs to be unsinged movq mm2,mm0 // copy texels pand mm0,[mask_greens] // mask greens pmulhw mm3,mm4 // combine u and v fractions (0x00FF == 1) movq mm1,mm2 // copy texels pand mm2,[mask_blues] // mask blues psrlw mm1,11 // shift reds into position (no need to mask) MODIFY_FOR_RED_SHIFT_D: pmaddwd mm1,mm3 // multiply and add reds (R0+R2, R1+R3) psrlw mm0,5 // shift greens into positon pmaddwd mm0,mm3 // multiply and add greens (G0+G2, G1+G3) movq mm4,[red_green_shift] // Load multiplication factors for shift. pmaddwd mm2,mm3 // multiply and add blues (B0+B2, B1+B3) movq mm3,[red_green_mask] // Load masks for red and green. packssdw mm1,mm0 // Pack reds and greens together (G0,G1,R0,R1) movq mm0,[qUVStep] // Load texture step. pmaddwd mm1,mm4 // Sum and shift reds and greens. (G, R) movq mm4,mm2 // Copy blues. paddd mm7,mm0 // Step U,V psrlq mm4,32 // Shift high blues down. paddd mm2,mm4 // Sum blues. (X, B) pand mm1,mm3 // Mask reds and greens. por mm2,mm1 // Combine blue and red. psrlq mm1,32 // Shift green down. por mm2,mm1 // Combine red with blue and green. psrld mm2,8 // Shift into final position. movd ebx,mm2 // Get pixel value. mov [edx + edi*2 - 2],bx // Store pixel 2 mov eax,[i_pixel] jmp short END_OF_SPAN ONE_PIXEL_LEFT: // // One pixel left, don't step texture co-ordinates. // movd mm0,[esi+eax*2+1] // lookup texels (0,0,T1,T0) movq mm3,mm5 // copy u fractions punpcklwd mm0,[ecx+eax*2+1] // lookup texels (T3, T1, T2, T0) movq mm4,mm6 // copy v fractions psrlw mm3,4 // shift ufracs to be unsigned psrlw mm4,4 // shift vfracs to be unsinged movq mm2,mm0 // copy texels pand mm0,[mask_greens] // mask greens pmulhw mm3,mm4 // combine u and v fractions (0x00FF == 1) movq mm1,mm2 // copy texels pand mm2,[mask_blues] // mask blues psrlw mm1,11 // shift reds into position (no need to mask) MODIFY_FOR_RED_SHIFT_E: pmaddwd mm1,mm3 // multiply and add reds (R0+R2, R1+R3) psrlw mm0,5 // shift greens into positon pmaddwd mm0,mm3 // multiply and add greens (G0+G2, G1+G3) movq mm4,[red_green_shift] // Load multiplication factors for shift. pmaddwd mm2,mm3 // multiply and add blues (B0+B2, B1+B3) movq mm3,[red_green_mask] // Load masks for red and green. packssdw mm1,mm0 // Pack reds and greens together (G0,G1,R0,R1) pmaddwd mm1,mm4 // Sum and shift reds and greens. (G, R) movq mm4,mm2 // Copy blues. psrlq mm4,32 // Shift high blues down. paddd mm2,mm4 // Sum blues. (X, B) pand mm1,mm3 // Mask reds and greens. por mm2,mm1 // Combine blue and red. psrlq mm1,32 // Shift green down. por mm2,mm1 // Combine red with blue and green. psrld mm2,8 // Shift into final position. movd eax,mm2 // Get pixel value. mov [edx + edi*2 - 4],ax // Store pixel 1 mov eax,[i_pixel] END_OF_SPAN: // See if there are any pixels left. cmp eax,0 jz short END_OF_SCANLINE // // Do perspective correction and looping. // PERSPECTIVE_SCANLINE_EPILOGUE // //---------------------------------------------------------------------------------- // Implement perspective correction. // // eax = i_pixel // Preserve: mm7 // mov edi,[iSubdivideLen] // Load last texture values. movq mm0,[tvCurUVZ.UInvZ] // mm0 = (V/Z,U/Z) movd mm1,[tvCurUVZ.InvZ] // mm1 = (?,1/Z) // Load span increments. movq mm2,[tvDEdge.UInvZ] // V,U movd mm3,[tvDEdge.InvZ] // Z // scan line is +ve add eax,edi // eax = i_pixel + SubdivideLen jle short DONE_DIVIDE_PIXEL_END // // Subdivision is smaller than iSubdivideLen // // Adjust span increments by -i_pixel * fInvSubdivideLen // mov edi,[i_pixel] // Integer width of span. xor eax,eax // Remaining width. neg edi movd mm4,edi // -i_pixel pi2fd (m4,m4) // Convert to fp. movd mm5,[fInvSubdivideLen] // 1/SubDivideLen pfmul (m4,m5) // -i_pixel * fInvSubdivideLen punpckldq mm4,mm4 pfmul (m2,m4) // Adjust V,U increments. pfmul (m3,m4) // Adjust Z increment. DONE_DIVIDE_PIXEL_END: // // Compute current U,V // Step fGUInvZ,fGVInvZ,fGInvZ // pfadd (m3,m1) // Step fGInvZ mov [iNextSubdivide],eax // Save length remaining. pfadd (m2,m0) // Step fGUInvZ,fGVInvZ movd [tvCurUVZ.InvZ],mm3 // Save updated fGInvZ pfrcp (m0,m3) // f_z = mm0 = 1/fGInvZ movq mm4,[pfCurUV] // Load last U,V movq [tvCurUVZ.UInvZ],mm2 // Save updated fGUInvZ,fGVInvZ movd mm1,float ptr[fInverseIntTable+edi*4] // Reciprocal of span width. pfmul (m0,m2) // mm0 = V2,U2 movq mm2,[pfFixed16Scale] // Load fixed point scale factors. #if (VER_CLAMP_UV_16BIT == TRUE) movq mm3,[pfTexEdgeTolerance] movq mm5,[cvMaxCoords] pfmax (m0,m3) // Clamp U1,V1 to >= fTexEdgeTolerance pfmin (m0,m5) // Clamp U1 < fTexWidth, V1 < fTexHeight #endif movq [pfCurUV],mm0 // Save current U,V punpckldq mm1,mm1 // Duplicate across entire register. pfsub (m0,m4) // V2-V1,U2-U1 mov edi,[i_pixel] // edi = current i_pixel mov [i_pixel],eax // Save updated i_pixel. mov edx,gsGlobals.pvScreen // Pointer the screen. pfmul (m0,m1) // DV*1/Width,DU*1/Width mov ebx,[i_screen_index] // Load scanline offset. sub edi,eax // edi = inner loop count mov esi,[pvTextureBitmap] // Load texture base pointer. pfmul (m0,m2) // Scale to 16.16 add eax,ebx // Add scanline offset to i_pixel lea edx,[edx+eax*2] // Base of span in edx. pf2id (m6,m0) // VStep,UStep in mm6 jmp SUBDIVISION_LOOP END_OF_SCANLINE: // // Increment the base edge. // mov esi,[pdtri] // Pointer to polygon object. mov ecx,[pscan] // Pointer to scanline object. // Step length, line starting address. mov edx,[esi]CDrawPolygonBase.fxLineLength.i4Fx mov edi,[esi]CDrawPolygonBase.iLineStartIndex add edx,[esi]CDrawPolygonBase.fxDeltaLineLength.i4Fx add edi,gsGlobals.u4LinePixels mov eax,[ecx]TCopyPerspFilter.fxX.i4Fx mov [esi]CDrawPolygonBase.fxLineLength.i4Fx,edx mov ebx,[ecx]TCopyPerspFilter.fxXDifference.i4Fx mov [esi]CDrawPolygonBase.iLineStartIndex,edi // Step edge values. movq mm0,[ecx]FakeTCopyPerspFilter.indCoord.fUInvZ movq mm2,[tvEdgeStep.UInvZ] movd mm1,[ecx]FakeTCopyPerspFilter.indCoord.fInvZ movd mm3,[tvEdgeStep.InvZ] pfadd (m0,m2) mov edi,[fx_diff] pfadd (m1,m3) add eax,[fx_inc] add ebx,edi jge short NO_UNDERFLOW // Underflow. // Add borrow/carry adjustments to U,V,Z. movq mm2,[tvNegAdj.UInvZ] add ebx,0x00010000 movd mm3,[tvNegAdj.InvZ] pfadd (m0,m2) pfadd (m1,m3) jmp short FINISH_LOOPING NO_UNDERFLOW: cmp ebx,0x00010000 jl short FINISH_LOOPING // Overflow. // Add borrow/carry adjustments to U,V,Z. movq mm2,[tvNegAdj.UInvZ] sub ebx,0x00010000 movd mm3,[tvNegAdj.InvZ] pfadd (m0,m2) pfadd (m1,m3) FINISH_LOOPING: // Save values and Y Loop control. movq [ecx]FakeTCopyPerspFilter.indCoord.fUInvZ,mm0 movd [ecx]FakeTCopyPerspFilter.indCoord.fInvZ,mm1 mov [ecx]TCopyPerspFilter.fxX.i4Fx,eax mov edx,[esi]CDrawPolygonBase.iY mov [ecx]TCopyPerspFilter.fxXDifference.i4Fx,ebx mov ebx,[esi]CDrawPolygonBase.iYTo inc edx mov [esi]CDrawPolygonBase.iY,edx cmp edx,ebx jl Y_LOOP // Done with MMX/3DX. femms } } //***************************************************************************************** // // Linear, no clut, non-transparent, filtered. // //********************************** void DrawSubtriangle(TCopyLinearFilter* pscan, CDrawPolygon* pdtri) { typedef TCopyLinearFilter::TIndex prevent_internal_compiler_errors; static int iBits = 16; static CDrawPolygon* pdtriGlbl; static TCopyLinearFilter* pscanGlbl; static PackedFloatUV pf_texture_inc; TCopyLinearFilter* plinc = &pdtri->pedgeBase->lineIncrement; // // This type is the same as TCopyLinearFilter, except that it uses the parent class of // CIndexLinearFilter to allow us to access members of it. // typedef CScanline, CIndexLinear, CColLookupOff> FakeTCopyLinearFilter; __asm { // Get ready to do MMX/3DX. femms // Check/Modify for 555/565 color. mov eax,gsGlobals.iBits cmp eax,[iBits] je short DONE_WITH_MODIFY mov [iBits],eax cmp eax,16 je MODIFY_FOR_565 // Change to 555. movq mm0,mask_greens_555 movq mask_greens,mm0 movq mm0,red_green_shift_555 movq red_green_shift,mm0 movq mm0,red_green_mask_555 movq red_green_mask,mm0 lea eax,MODIFY_FOR_RED_SHIFT_A mov byte ptr[eax-1],10 lea eax,MODIFY_FOR_RED_SHIFT_B mov byte ptr[eax-1],10 lea eax,MODIFY_FOR_RED_SHIFT_C mov byte ptr[eax-1],10 lea eax,MODIFY_FOR_RED_SHIFT_D mov byte ptr[eax-1],10 lea eax,MODIFY_FOR_RED_SHIFT_E mov byte ptr[eax-1],10 jmp short DONE_WITH_MODIFY MODIFY_FOR_565: // Change to 565. movq mm0,mask_greens_565 movq mask_greens,mm0 movq mm0,red_green_shift_565 movq red_green_shift,mm0 movq mm0,red_green_mask_565 movq red_green_mask,mm0 lea eax,MODIFY_FOR_RED_SHIFT_A mov byte ptr[eax-1],11 lea eax,MODIFY_FOR_RED_SHIFT_B mov byte ptr[eax-1],11 lea eax,MODIFY_FOR_RED_SHIFT_C mov byte ptr[eax-1],11 lea eax,MODIFY_FOR_RED_SHIFT_D mov byte ptr[eax-1],11 lea eax,MODIFY_FOR_RED_SHIFT_E mov byte ptr[eax-1],11 DONE_WITH_MODIFY: mov edi,[plinc] // Pointer to scanline increment. mov esi,[pdtri] // Pointer to polygon object. mov eax,[pscan] // Pointer to scanline object. mov pdtriGlbl,esi // // Local copies of edge stepping values. // // fx_inc = pdtri->pedgeBase->lineIncrement.fxX; // fx_diff = pdtri->pedgeBase->lineIncrement.fxXDifference; // pf_texture_inc = pdtri->pedgeBase->lineIncrement.indCoord.pfIndex; // pdtriGlbl = pdtri; // pscanGlbl = pscan; // fx_gour_inc = pdtri->pedgeBase->lineIncrement.gourIntensity.fxIntensity; // mov ebx,[edi]TCopyLinearFilter.fxX mov pscanGlbl,eax mov ecx,[edi]TCopyLinearFilter.fxXDifference mov fx_inc,ebx movq mm0,[edi]FakeTCopyLinearFilter.indCoord.pfIndex mov fx_diff,ecx movq [pf_texture_inc],mm0 // // Setup: // // mm2 = qMFactor = width,1,width,1 // mm3 = qUVMasks // mm6 = uvslope // movd mm1,[iTexWidth] // 0,0,0,Width movd mm2,[u4OneOne] // 0,0,1,1 punpcklwd mm1,mm1 // 0,0,Width,Width movq mm6,[pfxDeltaTex] // Load texture slopes. punpcklwd mm2,mm1 // Width,1,Width,1 movq mm7,[eax]FakeTCopyLinearFilter.indCoord.pfIndex movq [qMFactor],mm2 // Save mfactor movq [qUVStep],mm6 // Save slopes // // Iterate through the scanlines that intersect the subtriangle. // Y_LOOP: mov ebx,[bEvenScanlinesOnly] mov edx,[esi]CDrawPolygonBase.iY // Check for odd scan lines here and skip if even and edx,ebx jnz short END_OF_SCANLINE mov ebx,[eax]TCopyLinearFilter.fxX.i4Fx mov ecx,[esi]CDrawPolygonBase.fxLineLength.i4Fx mov edi,ebx mov edx,gsGlobals.pvScreen add ebx,ecx sar edi,16 mov ecx,[esi]CDrawPolygonBase.iLineStartIndex sar ebx,16 add ecx,ebx lea edx,[edx + ecx*2] sub edi,ebx jge short END_OF_SCANLINE // no pixels to draw //--------------------------------------------------------------------------- // Draw the scanline. // // mm6,mm7 = texture values. // // Setup registers: // // eax = texel_index1 ebx = texel_index2 // ecx = esi+twidth edx = dest_base_ptr // esi = texture_base_ptr edi = pixel_offset & count // ebp = base pointer // // mm0 = temp mm1 = temp // mm2 = mfactor/temp mm3 = temp // mm4 = temp mm5 = u fracs // mm6 = uvslope/v fracs mm7 = uv // movq mm6,[qUVStep] // Load slopes pf2id (m7,m7) // Convert texture values to 16.16 fixed. // Setup usteps and vsteps movq mm3,mm6 movq mm4,mm6 punpcklwd mm3,mm3 psrlq mm4,32 punpckldq mm3,mm3 punpcklwd mm4,mm4 pxor mm3,dword ptr[uf_xors] punpckldq mm4,mm4 paddw mm3,dword ptr[uf_adds] pxor mm4,dword ptr[vf_xors] paddw mm4,dword ptr[vf_adds] movq [usteps],mm3 movq [vsteps],mm4 mov esi,[pvTextureBitmap] // Load texture ponter. mov ecx,[iTexWidth] lea ecx,[esi+ecx*2] // Setup mm5 with u fractions, mm6 with v fractions, initial texture walk. movq mm5,mm7 movq mm0,mm7 // Copy U1,V1 paddd mm7,mm6 // Step U,V psrad mm0,16 // Shift for integer U1,V1 movq mm1,mm7 // Copy U2,V2 psrad mm1,16 // Shift for integer U2,V2 packssdw mm0,mm1 // Pack integer texture values pmaddwd mm0,[qMFactor] // Compute texture indicies. movq [qIndexTemp],mm0 // Save texture indicies. mov eax,dword ptr[qIndexTemp] // Get texture index 1 mov ebx,dword ptr[qIndexTemp + 4] // Get texture index 2 movq mm6,mm5 punpcklwd mm5,mm5 // ?,du,?,du psrlq mm6,32 punpckldq mm5,mm5 // du,du,du,du punpcklwd mm6,mm6 // ?,dv,?,dv pxor mm5,dword ptr[uf_xors] punpckldq mm6,mm6 // dv,dv,dv,dv pxor mm6,dword ptr[vf_xors] dec esi // Force sib+offset in loop. dec ecx // Force sib+offset in loop. // // Make sure we have more than 2 pixels left. // add edi,2 jl short INNER_LOOP_2P jmp FINISH_REMAINDER // Align start of loop to 7 past the beginning of a cache line. ALIGN 16 nop ALIGN 16 nop nop nop nop nop nop nop // // Do two pixels at a time. // // mm0 = packed & masked U1,V1,U2,V2 // mm7 is stepped once. // INNER_LOOP_2P: movd mm0,[esi+eax*2+1] // lookup texels (0,0,T1,T0) movq mm3,mm5 // copy u fractions punpcklwd mm0,[ecx+eax*2+1] // lookup texels (T3, T1, T2, T0) movq mm4,mm6 // copy v fractions psrlw mm3,4 // shift ufracs to be unsigned paddw mm5,[usteps] // advance u fractions psrlw mm4,4 // shift vfracs to be unsinged paddw mm6,[vsteps] // advance v fractions movq mm2,mm0 // copy texels pand mm0,[mask_greens] // mask greens pmulhw mm3,mm4 // combine u and v fractions (0x00FF == 1) movq mm1,mm2 // copy texels pand mm2,[mask_blues] // mask blues psrlw mm1,11 // shift reds into position (no need to mask) MODIFY_FOR_RED_SHIFT_A: pmaddwd mm1,mm3 // multiply and add reds (R0+R2, R1+R3) psrlw mm0,5 // shift greens into positon pmaddwd mm0,mm3 // multiply and add greens (G0+G2, G1+G3) movq mm4,[red_green_shift] // Load multiplication factors for shift. pmaddwd mm2,mm3 // multiply and add blues (B0+B2, B1+B3) movq mm3,[red_green_mask] // Load masks for red and green. packssdw mm1,mm0 // Pack reds and greens together (G0,G1,R0,R1) movd mm0,[esi+ebx*2+1] // lookup texels (0,0,T1,T0) pmaddwd mm1,mm4 // Sum and shift reds and greens. (G, R) movq mm4,mm2 // Copy blues. psrlq mm4,32 // Shift high blues down. punpcklwd mm0,[ecx+ebx*2+1] // lookup texels (T3, T1, T2, T0) paddd mm2,mm4 // Sum blues. (X, B) pand mm1,mm3 // Mask reds and greens. por mm2,mm1 // Combine blue and red. psrlq mm1,32 // Shift green down. por mm2,mm1 // Combine red with blue and green. movq mm3,mm5 // copy u fractions psrld mm2,8 // Shift into final position. movq mm4,mm6 // copy v fractions movd eax,mm2 // Get pixel value. psrlw mm3,4 // shift ufracs to be unsigned mov [edx + edi*2 - 4],ax // Store pixel 1 paddw mm5,[usteps] // advance u fractions nop // for cache alignment psrlw mm4,4 // shift vfracs to be unsinged paddw mm6,[vsteps] // advance v fractions movq mm2,mm0 // copy texels pand mm0,[mask_greens] // mask greens pmulhw mm3,mm4 // combine u and v fractions (0x00FF == 1) movq mm1,mm2 // copy texels pand mm2,[mask_blues] // mask blues psrlw mm1,11 // shift reds into position (no need to mask) MODIFY_FOR_RED_SHIFT_B: pmaddwd mm1,mm3 // multiply and add reds (R0+R2, R1+R3) psrlw mm0,5 // shift greens into positon pmaddwd mm0,mm3 // multiply and add greens (G0+G2, G1+G3) movq mm4,[red_green_shift] // Load multiplication factors for shift. pmaddwd mm2,mm3 // multiply and add blues (B0+B2, B1+B3) movq mm3,[red_green_mask] // Load masks for red and green. packssdw mm1,mm0 // Pack reds and greens together (G0,G1,R0,R1) movq mm0,[qUVStep] // Load texture step. pmaddwd mm1,mm4 // Sum and shift reds and greens. (G, R) movq mm4,mm2 // Copy blues. paddd mm7,mm0 // Step U,V psrlq mm4,32 // Shift high blues down. paddd mm2,mm4 // Sum blues. (X, B) movq mm4,mm7 // Copy U1,V1 paddd mm7,mm0 // Step U,V pand mm1,mm3 // Mask reds and greens. psrad mm4,16 // Shift for integer U1,V1 movq mm0,mm7 // Copy U2,V2 por mm2,mm1 // Combine blue and red. psrad mm0,16 // Shift for integer U2,V2 packssdw mm4,mm0 // Pack integer texture values. movq mm3,[qMFactor] // Texture multipliers psrlq mm1,32 // Shift green down. por mm2,mm1 // Combine red with blue and green. pmaddwd mm4,mm3 // Compute texture indicies. psrld mm2,8 // Shift into final position. movq [qIndexTemp],mm4 // Save texture indicies. movd ebx,mm2 // Get pixel value. mov eax,dword ptr[qIndexTemp] // Load texel index 1. add edi,2 // Inrcement index by 2 mov [edx + edi*2 - 6],bx // Store pixel 2 mov ebx,dword ptr[qIndexTemp+4] // Load texel index 2. jl short INNER_LOOP_2P // Loop (while >2 pixels) FINISH_REMAINDER: // // One pixel (edi == 1) or two pixels (edi == 0) left. // jnz short ONE_PIXEL_LEFT // // Two pixels left, only step texture co-ordinates once. // movd mm0,[esi+eax*2+1] // lookup texels (0,0,T1,T0) movq mm3,mm5 // copy u fractions punpcklwd mm0,[ecx+eax*2+1] // lookup texels (T3, T1, T2, T0) movq mm4,mm6 // copy v fractions psrlw mm3,4 // shift ufracs to be unsigned paddw mm5,[usteps] // advance u fractions psrlw mm4,4 // shift vfracs to be unsinged paddw mm6,[vsteps] // advance v fractions movq mm2,mm0 // copy texels pand mm0,[mask_greens] // mask greens pmulhw mm3,mm4 // combine u and v fractions (0x00FF == 1) movq mm1,mm2 // copy texels pand mm2,[mask_blues] // mask blues psrlw mm1,11 // shift reds into position (no need to mask) MODIFY_FOR_RED_SHIFT_C: pmaddwd mm1,mm3 // multiply and add reds (R0+R2, R1+R3) psrlw mm0,5 // shift greens into positon pmaddwd mm0,mm3 // multiply and add greens (G0+G2, G1+G3) movq mm4,[red_green_shift] // Load multiplication factors for shift. pmaddwd mm2,mm3 // multiply and add blues (B0+B2, B1+B3) movq mm3,[red_green_mask] // Load masks for red and green. packssdw mm1,mm0 // Pack reds and greens together (G0,G1,R0,R1) movd mm0,[esi+ebx*2+1] // lookup texels (0,0,T1,T0) pmaddwd mm1,mm4 // Sum and shift reds and greens. (G, R) movq mm4,mm2 // Copy blues. psrlq mm4,32 // Shift high blues down. punpcklwd mm0,[ecx+ebx*2+1] // lookup texels (T3, T1, T2, T0) paddd mm2,mm4 // Sum blues. (X, B) pand mm1,mm3 // Mask reds and greens. por mm2,mm1 // Combine blue and red. psrlq mm1,32 // Shift green down. por mm2,mm1 // Combine red with blue and green. movq mm3,mm5 // copy u fractions psrld mm2,8 // Shift into final position. movq mm4,mm6 // copy v fractions movd eax,mm2 // Get pixel value. psrlw mm3,4 // shift ufracs to be unsigned mov [edx + edi*2 - 4],ax // Store pixel 1 psrlw mm4,4 // shift vfracs to be unsinged movq mm2,mm0 // copy texels pand mm0,[mask_greens] // mask greens pmulhw mm3,mm4 // combine u and v fractions (0x00FF == 1) movq mm1,mm2 // copy texels pand mm2,[mask_blues] // mask blues psrlw mm1,11 // shift reds into position (no need to mask) MODIFY_FOR_RED_SHIFT_D: pmaddwd mm1,mm3 // multiply and add reds (R0+R2, R1+R3) psrlw mm0,5 // shift greens into positon pmaddwd mm0,mm3 // multiply and add greens (G0+G2, G1+G3) movq mm4,[red_green_shift] // Load multiplication factors for shift. pmaddwd mm2,mm3 // multiply and add blues (B0+B2, B1+B3) movq mm3,[red_green_mask] // Load masks for red and green. packssdw mm1,mm0 // Pack reds and greens together (G0,G1,R0,R1) movq mm0,[qUVStep] // Load texture step. pmaddwd mm1,mm4 // Sum and shift reds and greens. (G, R) movq mm4,mm2 // Copy blues. paddd mm7,mm0 // Step U,V psrlq mm4,32 // Shift high blues down. paddd mm2,mm4 // Sum blues. (X, B) pand mm1,mm3 // Mask reds and greens. por mm2,mm1 // Combine blue and red. psrlq mm1,32 // Shift green down. por mm2,mm1 // Combine red with blue and green. psrld mm2,8 // Shift into final position. movd ebx,mm2 // Get pixel value. mov [edx + edi*2 - 2],bx // Store pixel 2 jmp short END_OF_SCANLINE ONE_PIXEL_LEFT: // // One pixel left, don't step texture co-ordinates. // movd mm0,[esi+eax*2+1] // lookup texels (0,0,T1,T0) movq mm3,mm5 // copy u fractions punpcklwd mm0,[ecx+eax*2+1] // lookup texels (T3, T1, T2, T0) movq mm4,mm6 // copy v fractions psrlw mm3,4 // shift ufracs to be unsigned psrlw mm4,4 // shift vfracs to be unsinged movq mm2,mm0 // copy texels pand mm0,[mask_greens] // mask greens pmulhw mm3,mm4 // combine u and v fractions (0x00FF == 1) movq mm1,mm2 // copy texels pand mm2,[mask_blues] // mask blues psrlw mm1,11 // shift reds into position (no need to mask) MODIFY_FOR_RED_SHIFT_E: pmaddwd mm1,mm3 // multiply and add reds (R0+R2, R1+R3) psrlw mm0,5 // shift greens into positon pmaddwd mm0,mm3 // multiply and add greens (G0+G2, G1+G3) movq mm4,[red_green_shift] // Load multiplication factors for shift. pmaddwd mm2,mm3 // multiply and add blues (B0+B2, B1+B3) movq mm3,[red_green_mask] // Load masks for red and green. packssdw mm1,mm0 // Pack reds and greens together (G0,G1,R0,R1) pmaddwd mm1,mm4 // Sum and shift reds and greens. (G, R) movq mm4,mm2 // Copy blues. psrlq mm4,32 // Shift high blues down. paddd mm2,mm4 // Sum blues. (X, B) pand mm1,mm3 // Mask reds and greens. por mm2,mm1 // Combine blue and red. psrlq mm1,32 // Shift green down. por mm2,mm1 // Combine red with blue and green. psrld mm2,8 // Shift into final position. movd eax,mm2 // Get pixel value. mov [edx + edi*2 - 4],ax // Store pixel 1 // ------------------------------------------------------------------------- // Increment the base edge. // END_OF_SCANLINE: mov eax,[pscanGlbl] movq mm1,[pf_texture_inc] movq mm7,[eax]FakeTCopyLinearFilter.indCoord.pfIndex mov ebx,[eax]TCopyLinearFilter.fxXDifference pfadd (m7,m1) add ebx,[fx_diff] jge short NO_UNDERFLOW // Undeflow of fxXDifference. add ebx,0x00010000 movq mm1,[pfNegD] pfadd (m7,m1) jmp short EDGE_DONE NO_UNDERFLOW: cmp ebx,0x00010000 jl short EDGE_DONE // Overflow of fxXDifference. sub ebx,0x00010000 movq mm1,[pfNegD] pfadd (m7,m1) EDGE_DONE: // Store modified variables and do looping. // Step length, line starting address. mov esi,[pdtriGlbl] mov [eax]TCopyLinearFilter.fxXDifference,ebx movq [eax]FakeTCopyLinearFilter.indCoord.pfIndex,mm7 mov ebx,[esi]CDrawPolygonBase.fxLineLength.i4Fx mov ecx,[esi]CDrawPolygonBase.iLineStartIndex add ebx,[esi]CDrawPolygonBase.fxDeltaLineLength.i4Fx add ecx,gsGlobals.u4LinePixels mov [esi]CDrawPolygonBase.fxLineLength.i4Fx,ebx mov [esi]CDrawPolygonBase.iLineStartIndex,ecx mov ecx,[eax]TCopyLinearFilter.fxX mov edx,[fx_inc] add ecx,edx mov edx,[esi]CDrawPolygonBase.iY mov [eax]TCopyLinearFilter.fxX,ecx inc edx mov ebx,[esi]CDrawPolygonBase.iYTo mov [esi]CDrawPolygonBase.iY,edx cmp edx,ebx jl Y_LOOP // Done with MMX/3DX. femms } } #pragma code_seg() #endif // if (BILINEAR_FILTER)