Fast Sin and Cos on built-in ASM for Delphi

 
3r3-31. Hello!
 
 
There was a need to write a quick calculation of Sin and Cos. The basis for the calculations took the decomposition of the Taylor series. I use it in 3D systems (OpenGL and a graphical library of my own development). Unfortunately, it’s not possible to reduce the series “perfectly” for Double, but this is offset by good acceleration. The code is written in the assembler built into Delphi XE6. Used by SSE2.
 
 
Not suitable for scientific computing, but quite suitable for use in games.
 
There is enough accuracy to cover the digit capacity of the Single number, which is used
 
for multiplication by a matrix.
 
 
In summary:
 
 
 
3r3333. The achieved accuracy of the result is: 10.e-13
 
3r3333. The maximum discrepancy with the CPU is ???. 3r3334.  
3r3333. The speed is increased in comparison with the CPU by ??? times. 3r3334.  
3r3333. Speed ​​increased in comparison with Math.Sin and Math.Cos by 2.6 times. 3r3334.  
 
For the test I used an Intel Core-i???X Extreme 3.0 GHz processor.
 
The source code for Delphi is embedded in the comments to the assembler.
 
3r33479.
 
Source code:
 
 
Headline spoiler [/b]
3r3355. var
gSinTab: array of Double;
gSinAddr: UInt64;
const
AbsMask64: UInt64 = ($ 7FFFFFFFFFFFFFFF);
PI90: Double = (PI /2.0);
FSinTab: array[07]of Double =
(1.0 /6.? //3! 3r3493. 1.0 /120.? //5! 3r3495. 1.0 /5040.? //7! 3r3495. 1.0 /362880.? //9! 3r3495. 1.0 /39916800.? //11! 3r3r??? /6227020800.? //13! 3r3495; 1.0 /1307674368000.? //15! 3r3495 (1.0 /355687428096000.0); //17!
//Sin function maxima for positive values ​​
QSinMaxP: array[03]of Double = (0.? 1.? 0.? -1.0);
//Sin function maxima for negative values ​​
QSinMaxM: array[03]of Double = (0.? -1.? 0.? 1.0);
//Quadrant signs for positive values ​​
QSinSignP: array[03]Of Double = (1.? 1.? -1.? -1.0);
//Quadrant signs for negative values ​​
QSinSignM: array[03]Of Double = (-1.? -1.? 1.? 1.0);
function Sinf (const A: Double): Double;
asm 3r3495. .NOFRAME
//At the input A to xmm0
//A quarter of a circle
//X: = IntFMod (Abs (A), PI9? D);
//Result: = FNum - (Trunc (FNum /FDen) * FDen);
pxor xmm? xmm3
comisd A, xmm3
jnz @ANZ
ret //Result: = 0.0;
@ANZ:
//Flag of the negative angle
//Minus: = (A < 0.0);
Setc cl
Movsd xmm?[AbsMask64]//Abs (A) 3r3495. Pand A, xr1
Movsd xmm? as found, w2w2w3w3495. 3 , A //Copy A = FNum
Divsd xmm? xmm1 //(FNum /FDen)
Roundsd xmm? xmm? 11b //11b - Trunc
mulsd xmm? xmm1
subsd xmm? xmm2
3r3495. //--------------------------------- -
//Normalization of the quadrant
And rax, 3 //D: = (D and 3);
//----------------- ------------------
//Check for maximum function 3r3495. //if (X = 0.0) then 3r3495. //begin 3r3495. //if Minus then
//Exit (QSinMaxM[D]) else
//Exit (QSinMaxP[D]);
//end;
comisd xmm? xmm33 r3r3495. jnz @XNZ
shl rax, 3 //multiply the quadrant number by the size of Double (8 bytes) 3r3495.
cmp cl, 1 //If the angle is negative, then the transition
je @ maxminus
@MaxPlus:
lea rdx, qword ptr[QSinMaxP]
movsd xmm? qword ptr[rdx + rax]
ret
@ Maxminus:
lea rdx, qword ptr[QSinMaxM]
movsd xmm? qword ptr[rdx + rax]
ret
//-----------------------------------
@XNZ:
//For an odd quadrant, subtract degrees for symmetry
//if (D and 1) <> 0 then X := (PI90 - X);
mov edx, eax
and edx, 1
cmp edx, 0
je @DZ
subsd xmm? xmm0
movsd xmm? xmm1
//-----------------------------------
@DZ:
//Result remains in xmm0
//X in xmm0
//N: = (X * X) in xmm2
//F: = (N * X) in xmm1
//N
movsd xmm? xmm0 //Copy X
mulsd xmm? xmm2 //N: = (X * X)
//F
movsd xmm? xmm2 //copy N
mulsd xmm? xmm0 //F: = (N * X)
//Load factorial table for sine
mov rdx,[gSinTab]
movaps xmm? dqword ptr[rdx]
movaps xmm? dqword ptr[rdx + 16]
movaps xmm? dqword ptr[rdx + 32]
movaps xmm1? dqword ptr[rdx + 48]
//Extract the odd part of the table
movhlps xmm? xmm4
movhlps xmm? xmm6
movhlps xmm? xmm8
movhlps xmm1? xmm10
//Result: = X - F * PDouble (gSinAddr) ^;
mulsd xmm? xmm1 //FSinTab[0]
subsd xmm? xmm4
//F: = (F * N);
//Result: = Result + F * PDouble (gSinAddr + 8) ^;
mulsd xmm? xmm2
mulsd xmm? xmm1 //FSinTab[1]
addsd xmm? xmm5
//F: = (F * N);
//Result: = Result - F * PDouble (gSinAddr + 16) ^;
mulsd xmm? xmm2
mulsd xmm? xmm1 //FSinTab[2]
subsd xmm? xmm6
//F: = (F * N);
//Result: = Result + F * PDouble (gSinAddr + 24) ^;
mulsd xmm? xmm2
mulsd xmm? xmm1 //FSinTab[3]
addsd xmm? xmm7
//F: = (F * N);
//Result: = Result - F * PDouble (gSinAddr + 32) ^;
mulsd xmm? xmm2
mulsd xmm? xmm1 //FSinTab[4]
subsd xmm? xmm8
//F: = (F * N);
//Result: = Result + F * PDouble (gSinAddr + 40) ^;
mulsd xmm? xmm2
mulsd xmm? xmm1 //FSinTab[5]
addsd xmm? xmm9
//F: = (F * N);
//Result: = Result - F * PDouble (gSinAddr + 48) ^;
mulsd xmm? xmm2
mulsd xmm1? xmm1 //FSinTab[6]
subsd xmm? xmm10
//F: = (F * N);
//Result: = Result + F * PDouble (gSinAddr + 56) ^;
mulsd xmm? xmm2
mulsd xmm1? xmm1 //FSinTab[7]
addsd xmm? xmm11
//-----------------------------------
//if Minus then
//Result: = (Result * QSinSignM[D]) Else
//Result: = (Result * QSinSignP[D]);
shl rax, 3 //Multiply by 8
cmp cl, 1
je @ minus
lea rdx, qword ptr[QSinSignP]
mulsd xmm? qword ptr[rdx + rax]
ret
@Minus:
lea rdx, qword ptr[QSinSignM]
mulsd xmm? qword ptr[rdx + rax]
end;
//------------------------------------
function Cosf (const A: Double): Double;
asm 3r3495. .NOFRAME
//A in xmm0
//A quarter of a circle
//X: = IntFMod (AMod, PI9? D);
//Result: = FNum - (Trunc (FNum /FDen) * FDen);
pxor xmm? xmm3
comisd A, xmm3
jnz @ANZ
movsd xmm?[DoubleOne]
ret //Result: = 1.0;
//-----------------------------------
@ANZ:
//Flag of the negative angle
//Minus: = (A < 0.0);
Setc cl
Movsd xmm?[AbsMask64]//Abs (A)
Pand A, xmm1
//------------ -----------------------
Movsd xmm?[PI90]//PI90 = FDen
//------- ----------------------------
//if Minus then
//AMod: = Abs (A) - PI90 else
//AMod: = Abs (A) + PI90;
cmp cl, 1
je @ SubPI90
3rr3495. subsdA, xmm1
//-----------------------------------
@ IntFMod:
Movsd xmm? A //Copy A = FNum
Divsd xmm? xmm1 //(FNum /FDen)
Roundsd xmm? xmm? 11b //11b - Trun
(X in EAX)
Mulsd xmm? xmm1
Subsd xmm? xmm2
//------------------- --------------------
//Normalizing the quadrant
and rax, 3 //D: = (D and 3);
//-----------------------------------
//Check for maximum function
//if (X = 0.0) then
//begin
//if Minus then
//Exit (QSinMaxM[D]) Else
//Exit (QSinMaxP[D]);
//end;
comisd xmm? xmm3
jnz @XNZ
shl rax, 3 //multiply the quadrant number by size Double (8 bytes)
cmp cl, 1 //If the angle is negative, then the transition
je @ maxminus
@MaxPlus:
lea rdx, qword ptr[QSinMaxP]
movsd xmm? qword ptr[rdx + rax]
ret
@ Maxminus:
lea rdx, qword ptr[QSinMaxM]
movsd xmm? qword ptr[rdx + rax]
ret
//-----------------------------------
@XNZ:
//For an odd quadrant, subtract degrees for symmetry
//if (D and 1) <> 0 then X := (PI90 - X);
mov edx, eax
and edx, 1
cmp edx, 0
je @DZ
subsd xmm? xmm0
movsd xmm? xmm1
@DZ:
//Result remains in xmm0
//X in xmm0
//N: = (X * X) in xmm2
//F: = (N * X) in xmm1
//N
movsd xmm?xmm0 //copy x
mulsd xmm? xmm2 //N: = (X * X)
//F
movsd xmm? xmm2 //copy N
mulsd xmm? xmm0 //F: = (N * X)
//Load factorial table for sine
mov rdx,[gSinTab]
movaps xmm? dqword ptr[rdx]
movaps xmm? dqword ptr[rdx + 16]
movaps xmm? dqword ptr[rdx + 32]
movaps xmm1? dqword ptr[rdx + 48]
//Extract the odd part of the table
movhlps xmm? xmm4
movhlps xmm? xmm6
movhlps xmm? xmm8
movhlps xmm1? xmm10
//Result: = X - F * PDouble (gSinAddr) ^;
mulsd xmm? xmm1 //FSinTab[0]
subsd xmm? xmm4
//F: = (F * N);
//Result: = Result + F * PDouble (gSinAddr + 8) ^;
mulsd xmm? xmm2
mulsd xmm? xmm1 //FSinTab[1]
addsd xmm? xmm5
//F: = (F * N);
//Result: = Result - F * PDouble (gSinAddr + 16) ^;
mulsd xmm? xmm2
mulsd xmm? xmm1 //FSinTab[2]
subsd xmm? xmm6
//F: = (F * N);
//Result: = Result + F * PDouble (gSinAddr + 24) ^;
mulsd xmm? xmm2
mulsd xmm? xmm1 //FSinTab[3]
addsd xmm? xmm7
//F: = (F * N);
//Result: = Result - F * PDouble (gSinAddr + 32) ^;
mulsd xmm? xmm2
mulsd xmm? xmm1 //FSinTab[4]
subsd xmm? xmm8
//F: = (F * N);
//Result: = Result + F * PDouble (gSinAddr + 40) ^;
mulsd xmm? xmm2
mulsd xmm? xmm1 //FSinTab[5]
addsd xmm? xmm9
//F: = (F * N);
//Result: = Result - F * PDouble (gSinAddr + 48) ^;
mulsd xmm? xmm2
mulsd xmm1? xmm1 //FSinTab[6]
subsd xmm? xmm10
//F: = (F * N);
//Result: = Result + F * PDouble (gSinAddr + 56) ^;
mulsd xmm? xmm2
mulsd xmm1? xmm1 //FSinTab[7]
addsd xmm? xmm11
//-----------------------------------
//if Minus then
//Result: = (Result * QSinSignM[D]) Else
//Result: = (Result * QSinSignP[D]);
shl rax, 3 //Multiply by 8
cmp cl, 1
je @ minus
lea rdx, qword ptr[QSinSignP]
mulsd xmm? qword ptr[rdx + rax]
ret
@Minus:
lea rdx, qword ptr[QSinSignM]
mulsd xmm? qword ptr[rdx + rax]
end;
Initialization 3r3495. //Aligned array
SetLength (gSinTab, 8);
//Address of the table 3r3495. gSinAddr: = UInt64 (@FSinTab[0]);
//Copy the table into the aligned array
Move (FSinTab[0], GSinTab[0], SizeOf (Double) * 8);
Finalization
SetLength (gSinTab, 0);
3r33434. 3r33469.
 
 
3r33476. 3r37777. An example of work here is [/b] 3r33479.
 
 
Constructive suggestions and comments are welcome.
+ 0 -

Add comment